Merge branch 'iambus-master'
diff --git a/diffmatchpatch/diff.go b/diffmatchpatch/diff.go index 17ad990..08c36e7 100644 --- a/diffmatchpatch/diff.go +++ b/diffmatchpatch/diff.go
@@ -388,32 +388,30 @@ } // DiffLinesToChars splits two texts into a list of strings, and educes the texts to a string of hashes where each Unicode character represents one line. +// It's slightly faster to call DiffLinesToRunes first, followed by DiffMainRunes. func (dmp *DiffMatchPatch) DiffLinesToChars(text1, text2 string) (string, string, []string) { - chars1, chars2, lineArray := dmp.diffLinesToIndexes(text1, text2) - return indexesToString(chars1), indexesToString(chars2), lineArray + chars1, chars2, lineArray := dmp.diffLinesToStrings(text1, text2) + return chars1, chars2, lineArray } // DiffLinesToRunes splits two texts into a list of runes. func (dmp *DiffMatchPatch) DiffLinesToRunes(text1, text2 string) ([]rune, []rune, []string) { - chars1, chars2, lineArray := dmp.diffLinesToIndexes(text1, text2) - return []rune(indexesToString(chars1)), []rune(indexesToString(chars2)), lineArray -} - -// diffLinesToIndexes splits two texts into a list of indexes -func (dmp *DiffMatchPatch) diffLinesToIndexes(text1, text2 string) ([]index, []index, []string) { chars1, chars2, lineArray := dmp.diffLinesToStrings(text1, text2) - return chars1, chars2, lineArray + return []rune(chars1), []rune(chars2), lineArray } // DiffCharsToLines rehydrates the text in a diff from a string of line hashes to real lines of text. func (dmp *DiffMatchPatch) DiffCharsToLines(diffs []Diff, lineArray []string) []Diff { hydrated := make([]Diff, 0, len(diffs)) for _, aDiff := range diffs { - var sb strings.Builder - for _, i := range stringToIndex(aDiff.Text) { - sb.WriteString(lineArray[i]) + runes := []rune(aDiff.Text) + text := make([]string, len(runes)) + + for i, r := range runes { + text[i] = lineArray[runeToInt(r)] } - aDiff.Text = sb.String() + + aDiff.Text = strings.Join(text, "") hydrated = append(hydrated, aDiff) } return hydrated @@ -1148,13 +1146,28 @@ switch diff.Type { case DiffInsert: - _, _ = buff.WriteString("\x1b[32m") - _, _ = buff.WriteString(text) - _, _ = buff.WriteString("\x1b[0m") + lines := strings.Split(text, "\n") + for i, line := range lines { + _, _ = buff.WriteString("\x1b[32m") + _, _ = buff.WriteString(line) + if i < len(lines)-1 { + _, _ = buff.WriteString("\x1b[0m\n") + } else { + _, _ = buff.WriteString("\x1b[0m") + } + } + case DiffDelete: - _, _ = buff.WriteString("\x1b[31m") - _, _ = buff.WriteString(text) - _, _ = buff.WriteString("\x1b[0m") + lines := strings.Split(text, "\n") + for i, line := range lines { + _, _ = buff.WriteString("\x1b[31m") + _, _ = buff.WriteString(line) + if i < len(lines)-1 { + _, _ = buff.WriteString("\x1b[0m\n") + } else { + _, _ = buff.WriteString("\x1b[0m") + } + } case DiffEqual: _, _ = buff.WriteString(text) } @@ -1306,8 +1319,7 @@ } // diffLinesToStrings splits two texts into a list of strings. Each string represents one line. -func (dmp *DiffMatchPatch) diffLinesToStrings(text1, text2 string) ([]index, []index, []string) { - // '\x00' is a valid character, but various debuggers don't like it. So we'll insert a junk entry to avoid generating a null character. +func (dmp *DiffMatchPatch) diffLinesToStrings(text1, text2 string) (string, string, []string) { lineArray := []string{""} // e.g. lineArray[4] == 'Hello\n' lineHash := make(map[string]int) @@ -1315,12 +1327,11 @@ strIndexArray1 := dmp.diffLinesToStringsMunge(text1, &lineArray, lineHash) strIndexArray2 := dmp.diffLinesToStringsMunge(text2, &lineArray, lineHash) - return strIndexArray1, strIndexArray2, lineArray + return intArrayToString(strIndexArray1), intArrayToString(strIndexArray2), lineArray } -// diffLinesToStringsMunge splits a text into an array of strings, and reduces the texts to a []string. -func (dmp *DiffMatchPatch) diffLinesToStringsMunge(text string, lineArray *[]string, lineHash map[string]int) []uint32 { - // Walk the text, pulling out a substring for each line. text.split('\n') would would temporarily double our memory footprint. Modifying text would create many large strings to garbage collect. +// diffLinesToStringsMunge splits a text into an array of strings, and reduces the texts to a []index. +func (dmp *DiffMatchPatch) diffLinesToStringsMunge(text string, lineArray *[]string, lineHash map[string]int) []index { lineStart := 0 lineEnd := -1 strs := []index{}
diff --git a/diffmatchpatch/diff_test.go b/diffmatchpatch/diff_test.go index d1611da..2c43864 100644 --- a/diffmatchpatch/diff_test.go +++ b/diffmatchpatch/diff_test.go
@@ -317,9 +317,9 @@ {"", "alpha\r\nbeta\r\n\r\n\r\n", "", "\x01\x02\x03\x03", []string{"", "alpha\r\n", "beta\r\n", "\r\n"}}, {"a", "b", "\x01", "\x02", []string{"", "a", "b"}}, // Omit final newline. - {"alpha\nbeta\nalpha", "", "1,2,3", "", []string{"", "alpha\n", "beta\n", "alpha"}}, + {"alpha\nbeta\nalpha", "", "\x01\x02\x03", "", []string{"", "alpha\n", "beta\n", "alpha"}}, // Same lines in Text1 and Text2 - {"abc\ndefg\n12345\n", "abc\ndef\n12345\n678", "1,2,3", "1,4,3,5", []string{"", "abc\n", "defg\n", "12345\n", "def\n", "678"}}, + {"abc\ndefg\n12345\n", "abc\ndef\n12345\n678", "\x01\x02\x03", "\x01\x04\x03\x05", []string{"", "abc\n", "defg\n", "12345\n", "def\n", "678"}}, } { actualChars1, actualChars2, actualLines := dmp.DiffLinesToChars(tc.Text1, tc.Text2) assert.Equal(t, tc.ExpectedChars1, actualChars1, fmt.Sprintf("Test case #%d, %#v", i, tc)) @@ -1033,6 +1033,17 @@ Expected: "a\n\x1b[31m<B>b</B>\x1b[0m\x1b[32mc&d\x1b[0m", }, + { + Diffs: []Diff{ + {Type: DiffEqual, Text: "a\n"}, + {Type: DiffDelete, Text: "b\nc\n"}, + {Type: DiffEqual, Text: "def"}, + {Type: DiffInsert, Text: "\ng\nh"}, + {Type: DiffEqual, Text: "\ni"}, + }, + + Expected: "a\n\x1b[31mb\x1b[0m\n\x1b[31mc\x1b[0m\n\x1b[31m\x1b[0mdef\x1b[32m\x1b[0m\n\x1b[32mg\x1b[0m\n\x1b[32mh\x1b[0m\ni", + }, } { actual := dmp.DiffPrettyText(tc.Diffs) assert.Equal(t, tc.Expected, actual, fmt.Sprintf("Test case #%d, %#v", i, tc)) @@ -1468,6 +1479,38 @@ assert.NotEmpty(t, diffs) } +func TestDiffPartialLineIndex(t *testing.T) { + dmp := New() + t1, t2, tt := dmp.DiffLinesToChars( + `line 1 +line 2 +line 3 +line 4 +line 5 +line 6 +line 7 +line 8 +line 9 +line 10 text1`, + `line 1 +line 2 +line 3 +line 4 +line 5 +line 6 +line 7 +line 8 +line 9 +line 10 text2`) + diffs := dmp.DiffMain(t1, t2, false) + diffs = dmp.DiffCharsToLines(diffs, tt) + assert.Equal(t, []Diff{ + Diff{DiffEqual, "line 1\nline 2\nline 3\nline 4\nline 5\nline 6\nline 7\nline 8\nline 9\n"}, + Diff{DiffDelete, "line 10 text1"}, + Diff{DiffInsert, "line 10 text2"}, + }, diffs) +} + func BenchmarkDiffMain(bench *testing.B) { s1 := "`Twas brillig, and the slithy toves\nDid gyre and gimble in the wabe:\nAll mimsy were the borogoves,\nAnd the mome raths outgrabe.\n" s2 := "I am the very model of a modern major general,\nI've information vegetable, animal, and mineral,\nI know the kings of England, and I quote the fights historical,\nFrom Marathon to Waterloo, in order categorical.\n"
diff --git a/diffmatchpatch/stringutil.go b/diffmatchpatch/stringutil.go index 265f29c..573b6bf 100644 --- a/diffmatchpatch/stringutil.go +++ b/diffmatchpatch/stringutil.go
@@ -9,10 +9,16 @@ package diffmatchpatch import ( + "fmt" "strings" "unicode/utf8" ) +const UNICODE_INVALID_RANGE_START = 0xD800 +const UNICODE_INVALID_RANGE_END = 0xDFFF +const UNICODE_INVALID_RANGE_DELTA = UNICODE_INVALID_RANGE_END - UNICODE_INVALID_RANGE_START + 1 +const UNICODE_RANGE_MAX = 0x10FFFF + // unescaper unescapes selected chars for compatibility with JavaScript's encodeURI. // In speed critical applications this could be dropped since the receiving application will certainly decode these fine. Note that this function is case-sensitive. Thus "%3F" would not be unescaped. But this is ok because it is only called with the output of HttpUtility.UrlEncode which returns lowercase hex. Example: "%3f" -> "?", "%24" -> "$", etc. var unescaper = strings.NewReplacer( @@ -86,3 +92,99 @@ } return -1 } + +func intArrayToString(ns []index) string { + if len(ns) == 0 { + return "" + } + + b := []rune{} + for _, n := range ns { + b = append(b, intToRune(uint32(n))) + } + return string(b) +} + +// These constants define the number of bits representable +// in 1,2,3,4 byte utf8 sequences, respectively. +const ONE_BYTE_BITS = 7 +const TWO_BYTE_BITS = 11 +const THREE_BYTE_BITS = 16 +const FOUR_BYTE_BITS = 21 + +// Helper for getting a sequence of bits from an integer. +func getBits(i uint32, cnt byte, from byte) byte { + return byte((i >> from) & ((1 << cnt) - 1)) +} + +// Converts an integer in the range 0~1112060 into a rune. +// Based on the ranges table in https://en.wikipedia.org/wiki/UTF-8 +func intToRune(i uint32) rune { + if i < (1 << ONE_BYTE_BITS) { + return rune(i) + } + + if i < (1 << TWO_BYTE_BITS) { + r, size := utf8.DecodeRune([]byte{0b11000000 | getBits(i, 5, 6), 0b10000000 | getBits(i, 6, 0)}) + if size != 2 || r == utf8.RuneError { + panic(fmt.Sprintf("Error encoding an int %d with size 2, got rune %v and size %d", size, r, i)) + } + return r + } + + // Last -3 here needed because for some reason 3rd to last codepoint 65533 in this range + // was returning utf8.RuneError during encoding. + if i < ((1 << THREE_BYTE_BITS) - UNICODE_INVALID_RANGE_DELTA - 3) { + if i >= UNICODE_INVALID_RANGE_START { + i += UNICODE_INVALID_RANGE_DELTA + } + + r, size := utf8.DecodeRune([]byte{0b11100000 | getBits(i, 4, 12), 0b10000000 | getBits(i, 6, 6), 0b10000000 | getBits(i, 6, 0)}) + if size != 3 || r == utf8.RuneError { + panic(fmt.Sprintf("Error encoding an int %d with size 3, got rune %v and size %d", size, r, i)) + } + return r + } + + if i < (1<<FOUR_BYTE_BITS - UNICODE_INVALID_RANGE_DELTA - 3) { + i += UNICODE_INVALID_RANGE_DELTA + 3 + r, size := utf8.DecodeRune([]byte{0b11110000 | getBits(i, 3, 18), 0b10000000 | getBits(i, 6, 12), 0b10000000 | getBits(i, 6, 6), 0b10000000 | getBits(i, 6, 0)}) + if size != 4 || r == utf8.RuneError { + panic(fmt.Sprintf("Error encoding an int %d with size 4, got rune %v and size %d", size, r, i)) + } + return r + } + panic(fmt.Sprintf("The integer %d is too large for runeToInt()", i)) +} + +// Converts a rune generated by intToRune back to an integer +func runeToInt(r rune) uint32 { + i := uint32(r) + if i < (1 << ONE_BYTE_BITS) { + return i + } + + bytes := []byte{0, 0, 0, 0} + + size := utf8.EncodeRune(bytes, r) + + if size == 2 { + return uint32(bytes[0]&0b11111)<<6 | uint32(bytes[1]&0b111111) + } + + if size == 3 { + result := uint32(bytes[0]&0b1111)<<12 | uint32(bytes[1]&0b111111)<<6 | uint32(bytes[2]&0b111111) + if result >= UNICODE_INVALID_RANGE_END { + return result - UNICODE_INVALID_RANGE_DELTA + } + + return result + } + + if size == 4 { + result := uint32(bytes[0]&0b111)<<18 | uint32(bytes[1]&0b111111)<<12 | uint32(bytes[2]&0b111111)<<6 | uint32(bytes[3]&0b111111) + return result - UNICODE_INVALID_RANGE_DELTA - 3 + } + + panic(fmt.Sprintf("Unexpected state decoding rune=%v size=%d", r, size)) +}
diff --git a/diffmatchpatch/stringutil_test.go b/diffmatchpatch/stringutil_test.go index ab2bc10..73ab6ca 100644 --- a/diffmatchpatch/stringutil_test.go +++ b/diffmatchpatch/stringutil_test.go
@@ -114,3 +114,21 @@ assert.Equal(t, tc.Expected, actual, fmt.Sprintf("Test case #%d, %#v", i, tc)) } } + +// Exhaustive check for all ints from 0 to 1112060 for correctness of implementation +// of `intToRune() -> runeToInt()`. +// This test is slow and runs longer than 5 seconds but it does provide a safety +// guarantee that these 2 functions are correct for the ranges we support. +func TestRuneToInt(t *testing.T) { + + for i := uint32(0); i <= UNICODE_RANGE_MAX-UNICODE_INVALID_RANGE_DELTA-3; i += 1 { + r := intToRune(i) + ic := runeToInt(r) + + assert.Equal(t, i, ic, fmt.Sprintf("intToRune(%d)=%d and runeToInt(%d)=%d", i, r, r, ic)) + } + + assert.Panics(t, func() { + intToRune(UNICODE_RANGE_MAX - UNICODE_INVALID_RANGE_DELTA - 2) + }) +}
diff --git a/go.mod b/go.mod index c731033..c7886ce 100644 --- a/go.mod +++ b/go.mod
@@ -8,4 +8,4 @@ gopkg.in/yaml.v2 v2.4.0 // indirect ) -go 1.12 +go 1.13