WIP switch writeBlocks to take a slice pointer
diff --git a/xxhash.go b/xxhash.go
index 7ab754e..309717f 100644
--- a/xxhash.go
+++ b/xxhash.go
@@ -133,7 +133,7 @@
 
 	if len(b) >= 32 {
 		// One or more full blocks left.
-		b = writeBlocks(x, b)
+		writeBlocks(x, &b)
 	}
 
 	// Store any remaining partial block.
@@ -143,8 +143,9 @@
 	return
 }
 
-func writeBlocksGo(x *xxh, b []byte) []byte {
+func writeBlocksGo(x *xxh, bp *[]byte) {
 	v1, v2, v3, v4 := x.v1, x.v2, x.v3, x.v4
+	b := *bp
 	for len(b) >= 32 {
 		v1 = round(v1, u64(b[0:8:len(b)]))
 		v2 = round(v2, u64(b[8:16:len(b)]))
@@ -153,7 +154,7 @@
 		b = b[32:len(b):len(b)]
 	}
 	x.v1, x.v2, x.v3, x.v4 = v1, v2, v3, v4
-	return b
+	*bp = b
 }
 
 func (x *xxh) Sum(b []byte) []byte {
diff --git a/xxhash_amd64.go b/xxhash_amd64.go
index e294303..e1335d9 100644
--- a/xxhash_amd64.go
+++ b/xxhash_amd64.go
@@ -6,4 +6,4 @@
 
 func sum64(b []byte) uint64
 
-func writeBlocks(x *xxh, b []byte) []byte
+func writeBlocks(x *xxh, bp *[]byte)
diff --git a/xxhash_amd64.s b/xxhash_amd64.s
index c49483d..54043eb 100644
--- a/xxhash_amd64.s
+++ b/xxhash_amd64.s
@@ -170,17 +170,18 @@
 	RET
 
 // writeBlocks uses the same registers as above except that it uses AX to store
-// the x pointer.
+// the x pointer and R15 to store the bp pointer.
 
-// func writeBlocks(x *xxh, b []byte) []byte
-TEXT ·writeBlocks(SB), NOSPLIT, $0-56
+// func writeBlocks(x *xxh, bp *[]byte)
+TEXT ·writeBlocks(SB), NOSPLIT, $0-16
 	// Load fixed primes needed for round.
 	MOVQ ·prime1(SB), R13
 	MOVQ ·prime2(SB), R14
 
 	// Load slice.
-	MOVQ b_base+8(FP), CX
-	MOVQ b_len+16(FP), DX
+	MOVQ bp+8(FP), R15
+	MOVQ (R15), CX      // base
+	MOVQ 8(R15), DX     // len
 	LEAQ (CX)(DX*1), BX
 	SUBQ $32, BX
 
@@ -208,15 +209,15 @@
 	MOVQ R10, 16(AX)
 	MOVQ R11, 24(AX)
 
-	// Construct return slice.
-	MOVQ CX, ret+32(FP)
+	// Write result slice.
+	MOVQ CX, (R15)
 
 	// New length is 32 - (CX - BX) -> BX+32 - CX.
 	ADDQ $32, BX
 	SUBQ CX, BX
-	MOVQ BX, ret+40(FP)
+	MOVQ BX, 8(R15)
 
 	// Set the cap same as length.
-	MOVQ BX, ret+48(FP)
+	MOVQ BX, 16(R15)
 
 	RET
diff --git a/xxhash_amd64_test.go b/xxhash_amd64_test.go
index 330ce96..a9f1495 100644
--- a/xxhash_amd64_test.go
+++ b/xxhash_amd64_test.go
@@ -27,17 +27,18 @@
 	x0 := New().(*xxh)
 	x1 := New().(*xxh)
 	for i := 32; i < 500; i++ {
-		b := make([]byte, i)
-		for j := range b {
-			b[j] = byte(j)
+		b0 := make([]byte, i)
+		for j := range b0 {
+			b0[j] = byte(j)
 		}
-		pureGo := writeBlocksGo(x0, b)
-		asm := writeBlocks(x1, b)
-		if !reflect.DeepEqual(pureGo, asm) {
-			t.Fatalf("[i=%d] pure go gave %v; asm gave %v", i, pureGo, asm)
+		b1 := b0
+		writeBlocksGo(x0, &b0)
+		writeBlocks(x1, &b1)
+		if !reflect.DeepEqual(b0, b1) {
+			t.Fatalf("[i=%d] pure go gave %v; asm gave %v", i, b0, b1)
 		}
 		if !reflect.DeepEqual(x0, x1) {
 			t.Fatalf("[i=%d] pure go had state %v; asm had state %v", i, x0, x1)
 		}
 	}
 }