| // Copyright 2011 the V8 project authors. All rights reserved. |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| |
| #ifndef SRC_STRING_SEARCH_H_ |
| #define SRC_STRING_SEARCH_H_ |
| |
| #if defined(NODE_WANT_INTERNALS) && NODE_WANT_INTERNALS |
| |
| #include "node_internals.h" |
| #include <string.h> |
| #include <algorithm> |
| |
| namespace node { |
| namespace stringsearch { |
| |
| static const uint32_t kMaxOneByteCharCodeU = 0xff; |
| |
| template <typename T> |
| class Vector { |
| public: |
| Vector(T* data, size_t length, bool isForward) |
| : start_(data), length_(length), is_forward_(isForward) { |
| CHECK(length > 0 && data != nullptr); |
| } |
| |
| // Returns the start of the memory range. |
| // For vector v this is NOT necessarily &v[0], see forward(). |
| const T* start() const { return start_; } |
| |
| // Returns the length of the vector, in characters. |
| size_t length() const { return length_; } |
| |
| // Returns true if the Vector is front-to-back, false if back-to-front. |
| // In the latter case, v[0] corresponds to the *end* of the memory range. |
| size_t forward() const { return is_forward_; } |
| |
| // Access individual vector elements - checks bounds in debug mode. |
| T& operator[](size_t index) const { |
| #ifdef DEBUG |
| CHECK(index < length_); |
| #endif |
| return start_[is_forward_ ? index : (length_ - index - 1)]; |
| } |
| |
| private: |
| T* start_; |
| size_t length_; |
| bool is_forward_; |
| }; |
| |
| |
| //--------------------------------------------------------------------- |
| // String Search object. |
| //--------------------------------------------------------------------- |
| |
| // Class holding constants and methods that apply to all string search variants, |
| // independently of subject and pattern char size. |
| class StringSearchBase { |
| protected: |
| // Cap on the maximal shift in the Boyer-Moore implementation. By setting a |
| // limit, we can fix the size of tables. For a needle longer than this limit, |
| // search will not be optimal, since we only build tables for a suffix |
| // of the string, but it is a safe approximation. |
| static const int kBMMaxShift = 250; |
| |
| // Reduce alphabet to this size. |
| // One of the tables used by Boyer-Moore and Boyer-Moore-Horspool has size |
| // proportional to the input alphabet. We reduce the alphabet size by |
| // equating input characters modulo a smaller alphabet size. This gives |
| // a potentially less efficient searching, but is a safe approximation. |
| // For needles using only characters in the same Unicode 256-code point page, |
| // there is no search speed degradation. |
| static const int kLatin1AlphabetSize = 256; |
| static const int kUC16AlphabetSize = 256; |
| |
| // Bad-char shift table stored in the state. It's length is the alphabet size. |
| // For patterns below this length, the skip length of Boyer-Moore is too short |
| // to compensate for the algorithmic overhead compared to simple brute force. |
| static const int kBMMinPatternLength = 8; |
| |
| // Store for the BoyerMoore(Horspool) bad char shift table. |
| int bad_char_shift_table_[kUC16AlphabetSize]; |
| // Store for the BoyerMoore good suffix shift table. |
| int good_suffix_shift_table_[kBMMaxShift + 1]; |
| // Table used temporarily while building the BoyerMoore good suffix |
| // shift table. |
| int suffix_table_[kBMMaxShift + 1]; |
| }; |
| |
| template <typename Char> |
| class StringSearch : private StringSearchBase { |
| public: |
| typedef stringsearch::Vector<const Char> Vector; |
| |
| explicit StringSearch(Vector pattern) |
| : pattern_(pattern), start_(0) { |
| if (pattern.length() >= kBMMaxShift) { |
| start_ = pattern.length() - kBMMaxShift; |
| } |
| |
| size_t pattern_length = pattern_.length(); |
| CHECK_GT(pattern_length, 0); |
| if (pattern_length < kBMMinPatternLength) { |
| if (pattern_length == 1) { |
| strategy_ = &StringSearch::SingleCharSearch; |
| return; |
| } |
| strategy_ = &StringSearch::LinearSearch; |
| return; |
| } |
| strategy_ = &StringSearch::InitialSearch; |
| } |
| |
| size_t Search(Vector subject, size_t index) { |
| return (this->*strategy_)(subject, index); |
| } |
| |
| static inline int AlphabetSize() { |
| if (sizeof(Char) == 1) { |
| // Latin1 needle. |
| return kLatin1AlphabetSize; |
| } else { |
| // UC16 needle. |
| return kUC16AlphabetSize; |
| } |
| |
| static_assert(sizeof(Char) == sizeof(uint8_t) || |
| sizeof(Char) == sizeof(uint16_t), |
| "sizeof(Char) == sizeof(uint16_t) || sizeof(uint8_t)"); |
| } |
| |
| private: |
| typedef size_t (StringSearch::*SearchFunction)(Vector, size_t); |
| size_t SingleCharSearch(Vector subject, size_t start_index); |
| size_t LinearSearch(Vector subject, size_t start_index); |
| size_t InitialSearch(Vector subject, size_t start_index); |
| size_t BoyerMooreHorspoolSearch(Vector subject, size_t start_index); |
| size_t BoyerMooreSearch(Vector subject, size_t start_index); |
| |
| void PopulateBoyerMooreHorspoolTable(); |
| |
| void PopulateBoyerMooreTable(); |
| |
| static inline int CharOccurrence(int* bad_char_occurrence, |
| Char char_code) { |
| if (sizeof(Char) == 1) { |
| return bad_char_occurrence[static_cast<int>(char_code)]; |
| } |
| // Both pattern and subject are UC16. Reduce character to equivalence class. |
| int equiv_class = char_code % kUC16AlphabetSize; |
| return bad_char_occurrence[equiv_class]; |
| } |
| |
| // The pattern to search for. |
| Vector pattern_; |
| // Pointer to implementation of the search. |
| SearchFunction strategy_; |
| // Cache value of Max(0, pattern_length() - kBMMaxShift) |
| size_t start_; |
| }; |
| |
| |
| template <typename T, typename U> |
| inline T AlignDown(T value, U alignment) { |
| return reinterpret_cast<T>( |
| (reinterpret_cast<uintptr_t>(value) & ~(alignment - 1))); |
| } |
| |
| |
| inline uint8_t GetHighestValueByte(uint16_t character) { |
| return std::max(static_cast<uint8_t>(character & 0xFF), |
| static_cast<uint8_t>(character >> 8)); |
| } |
| |
| |
| inline uint8_t GetHighestValueByte(uint8_t character) { return character; } |
| |
| |
| // Searches for a byte value in a memory buffer, back to front. |
| // Uses memrchr(3) on systems which support it, for speed. |
| // Falls back to a vanilla for loop on non-GNU systems such as Windows. |
| inline const void* MemrchrFill(const void* haystack, uint8_t needle, |
| size_t haystack_len) { |
| #ifdef _GNU_SOURCE |
| return memrchr(haystack, needle, haystack_len); |
| #else |
| const uint8_t* haystack8 = static_cast<const uint8_t*>(haystack); |
| for (size_t i = haystack_len - 1; i != static_cast<size_t>(-1); i--) { |
| if (haystack8[i] == needle) { |
| return haystack8 + i; |
| } |
| } |
| return nullptr; |
| #endif |
| } |
| |
| |
| // Finds the first occurrence of *two-byte* character pattern[0] in the string |
| // `subject`. Does not check that the whole pattern matches. |
| template <typename Char> |
| inline size_t FindFirstCharacter(Vector<const Char> pattern, |
| Vector<const Char> subject, size_t index) { |
| const Char pattern_first_char = pattern[0]; |
| const size_t max_n = (subject.length() - pattern.length() + 1); |
| |
| // For speed, search for the more `rare` of the two bytes in pattern[0] |
| // using memchr / memrchr (which are much faster than a simple for loop). |
| const uint8_t search_byte = GetHighestValueByte(pattern_first_char); |
| size_t pos = index; |
| do { |
| const size_t bytes_to_search = (max_n - pos) * sizeof(Char); |
| const void* void_pos; |
| if (subject.forward()) { |
| // Assert that bytes_to_search won't overflow |
| CHECK_LE(pos, max_n); |
| CHECK_LE(max_n - pos, SIZE_MAX / sizeof(Char)); |
| void_pos = memchr(subject.start() + pos, search_byte, bytes_to_search); |
| } else { |
| CHECK_LE(pos, subject.length()); |
| CHECK_LE(subject.length() - pos, SIZE_MAX / sizeof(Char)); |
| void_pos = MemrchrFill(subject.start() + pattern.length() - 1, |
| search_byte, |
| bytes_to_search); |
| } |
| const Char* char_pos = static_cast<const Char*>(void_pos); |
| if (char_pos == nullptr) |
| return subject.length(); |
| |
| // Then, for each match, verify that the full two bytes match pattern[0]. |
| char_pos = AlignDown(char_pos, sizeof(Char)); |
| size_t raw_pos = static_cast<size_t>(char_pos - subject.start()); |
| pos = subject.forward() ? raw_pos : (subject.length() - raw_pos - 1); |
| if (subject[pos] == pattern_first_char) { |
| // Match found, hooray. |
| return pos; |
| } |
| // Search byte matched, but the other byte of pattern[0] didn't. Keep going. |
| } while (++pos < max_n); |
| |
| return subject.length(); |
| } |
| |
| |
| // Finds the first occurrence of the byte pattern[0] in string `subject`. |
| // Does not verify that the whole pattern matches. |
| template <> |
| inline size_t FindFirstCharacter(Vector<const uint8_t> pattern, |
| Vector<const uint8_t> subject, |
| size_t index) { |
| const uint8_t pattern_first_char = pattern[0]; |
| const size_t subj_len = subject.length(); |
| const size_t max_n = (subject.length() - pattern.length() + 1); |
| |
| const void* pos; |
| if (subject.forward()) { |
| pos = memchr(subject.start() + index, pattern_first_char, max_n - index); |
| } else { |
| pos = MemrchrFill(subject.start() + pattern.length() - 1, |
| pattern_first_char, |
| max_n - index); |
| } |
| const uint8_t* char_pos = static_cast<const uint8_t*>(pos); |
| if (char_pos == nullptr) { |
| return subj_len; |
| } |
| |
| size_t raw_pos = static_cast<size_t>(char_pos - subject.start()); |
| return subject.forward() ? raw_pos : (subj_len - raw_pos - 1); |
| } |
| |
| //--------------------------------------------------------------------- |
| // Single Character Pattern Search Strategy |
| //--------------------------------------------------------------------- |
| |
| template <typename Char> |
| size_t StringSearch<Char>::SingleCharSearch( |
| Vector subject, |
| size_t index) { |
| CHECK_EQ(1, pattern_.length()); |
| return FindFirstCharacter(pattern_, subject, index); |
| } |
| |
| //--------------------------------------------------------------------- |
| // Linear Search Strategy |
| //--------------------------------------------------------------------- |
| |
| // Simple linear search for short patterns. Never bails out. |
| template <typename Char> |
| size_t StringSearch<Char>::LinearSearch( |
| Vector subject, |
| size_t index) { |
| CHECK_GT(pattern_.length(), 1); |
| const size_t n = subject.length() - pattern_.length(); |
| for (size_t i = index; i <= n; i++) { |
| i = FindFirstCharacter(pattern_, subject, i); |
| if (i == subject.length()) |
| return subject.length(); |
| CHECK_LE(i, n); |
| |
| bool matches = true; |
| for (size_t j = 1; j < pattern_.length(); j++) { |
| if (pattern_[j] != subject[i + j]) { |
| matches = false; |
| break; |
| } |
| } |
| if (matches) { |
| return i; |
| } |
| } |
| return subject.length(); |
| } |
| |
| //--------------------------------------------------------------------- |
| // Boyer-Moore string search |
| //--------------------------------------------------------------------- |
| |
| template <typename Char> |
| size_t StringSearch<Char>::BoyerMooreSearch( |
| Vector subject, |
| size_t start_index) { |
| const size_t subject_length = subject.length(); |
| const size_t pattern_length = pattern_.length(); |
| // Only preprocess at most kBMMaxShift last characters of pattern. |
| size_t start = start_; |
| |
| int* bad_char_occurrence = bad_char_shift_table_; |
| int* good_suffix_shift = good_suffix_shift_table_ - start_; |
| |
| Char last_char = pattern_[pattern_length - 1]; |
| size_t index = start_index; |
| // Continue search from i. |
| while (index <= subject_length - pattern_length) { |
| size_t j = pattern_length - 1; |
| int c; |
| while (last_char != (c = subject[index + j])) { |
| int shift = j - CharOccurrence(bad_char_occurrence, c); |
| index += shift; |
| if (index > subject_length - pattern_length) { |
| return subject.length(); |
| } |
| } |
| while (pattern_[j] == (c = subject[index + j])) { |
| if (j == 0) { |
| return index; |
| } |
| j--; |
| } |
| if (j < start) { |
| // we have matched more than our tables allow us to be smart about. |
| // Fall back on BMH shift. |
| index += pattern_length - 1 - |
| CharOccurrence(bad_char_occurrence, |
| static_cast<Char>(last_char)); |
| } else { |
| int gs_shift = good_suffix_shift[j + 1]; |
| int bc_occ = CharOccurrence(bad_char_occurrence, c); |
| int shift = j - bc_occ; |
| if (gs_shift > shift) { |
| shift = gs_shift; |
| } |
| index += shift; |
| } |
| } |
| |
| return subject.length(); |
| } |
| |
| template <typename Char> |
| void StringSearch<Char>::PopulateBoyerMooreTable() { |
| const size_t pattern_length = pattern_.length(); |
| // Only look at the last kBMMaxShift characters of pattern (from start_ |
| // to pattern_length). |
| const size_t start = start_; |
| const size_t length = pattern_length - start; |
| |
| // Biased tables so that we can use pattern indices as table indices, |
| // even if we only cover the part of the pattern from offset start. |
| int* shift_table = good_suffix_shift_table_ - start_; |
| int* suffix_table = suffix_table_ - start_; |
| |
| // Initialize table. |
| for (size_t i = start; i < pattern_length; i++) { |
| shift_table[i] = length; |
| } |
| shift_table[pattern_length] = 1; |
| suffix_table[pattern_length] = pattern_length + 1; |
| |
| if (pattern_length <= start) { |
| return; |
| } |
| |
| // Find suffixes. |
| Char last_char = pattern_[pattern_length - 1]; |
| size_t suffix = pattern_length + 1; |
| { |
| size_t i = pattern_length; |
| while (i > start) { |
| Char c = pattern_[i - 1]; |
| while (suffix <= pattern_length && c != pattern_[suffix - 1]) { |
| if (static_cast<size_t>(shift_table[suffix]) == length) { |
| shift_table[suffix] = suffix - i; |
| } |
| suffix = suffix_table[suffix]; |
| } |
| suffix_table[--i] = --suffix; |
| if (suffix == pattern_length) { |
| // No suffix to extend, so we check against last_char only. |
| while ((i > start) && (pattern_[i - 1] != last_char)) { |
| if (static_cast<size_t>(shift_table[pattern_length]) == length) { |
| shift_table[pattern_length] = pattern_length - i; |
| } |
| suffix_table[--i] = pattern_length; |
| } |
| if (i > start) { |
| suffix_table[--i] = --suffix; |
| } |
| } |
| } |
| } |
| // Build shift table using suffixes. |
| if (suffix < pattern_length) { |
| for (size_t i = start; i <= pattern_length; i++) { |
| if (static_cast<size_t>(shift_table[i]) == length) { |
| shift_table[i] = suffix - start; |
| } |
| if (i == suffix) { |
| suffix = suffix_table[suffix]; |
| } |
| } |
| } |
| } |
| |
| //--------------------------------------------------------------------- |
| // Boyer-Moore-Horspool string search. |
| //--------------------------------------------------------------------- |
| |
| template <typename Char> |
| size_t StringSearch<Char>::BoyerMooreHorspoolSearch( |
| Vector subject, |
| size_t start_index) { |
| const size_t subject_length = subject.length(); |
| const size_t pattern_length = pattern_.length(); |
| int* char_occurrences = bad_char_shift_table_; |
| int64_t badness = -pattern_length; |
| |
| // How bad we are doing without a good-suffix table. |
| Char last_char = pattern_[pattern_length - 1]; |
| int last_char_shift = |
| pattern_length - 1 - |
| CharOccurrence(char_occurrences, static_cast<Char>(last_char)); |
| |
| // Perform search |
| size_t index = start_index; // No matches found prior to this index. |
| while (index <= subject_length - pattern_length) { |
| size_t j = pattern_length - 1; |
| int subject_char; |
| while (last_char != (subject_char = subject[index + j])) { |
| int bc_occ = CharOccurrence(char_occurrences, subject_char); |
| int shift = j - bc_occ; |
| index += shift; |
| badness += 1 - shift; // at most zero, so badness cannot increase. |
| if (index > subject_length - pattern_length) { |
| return subject_length; |
| } |
| } |
| j--; |
| while (pattern_[j] == (subject[index + j])) { |
| if (j == 0) { |
| return index; |
| } |
| j--; |
| } |
| index += last_char_shift; |
| // Badness increases by the number of characters we have |
| // checked, and decreases by the number of characters we |
| // can skip by shifting. It's a measure of how we are doing |
| // compared to reading each character exactly once. |
| badness += (pattern_length - j) - last_char_shift; |
| if (badness > 0) { |
| PopulateBoyerMooreTable(); |
| strategy_ = &StringSearch::BoyerMooreSearch; |
| return BoyerMooreSearch(subject, index); |
| } |
| } |
| return subject.length(); |
| } |
| |
| template <typename Char> |
| void StringSearch<Char>::PopulateBoyerMooreHorspoolTable() { |
| const size_t pattern_length = pattern_.length(); |
| |
| int* bad_char_occurrence = bad_char_shift_table_; |
| |
| // Only preprocess at most kBMMaxShift last characters of pattern. |
| const size_t start = start_; |
| // Run forwards to populate bad_char_table, so that *last* instance |
| // of character equivalence class is the one registered. |
| // Notice: Doesn't include the last character. |
| const size_t table_size = AlphabetSize(); |
| if (start == 0) { |
| // All patterns less than kBMMaxShift in length. |
| memset(bad_char_occurrence, -1, table_size * sizeof(*bad_char_occurrence)); |
| } else { |
| for (size_t i = 0; i < table_size; i++) { |
| bad_char_occurrence[i] = start - 1; |
| } |
| } |
| for (size_t i = start; i < pattern_length - 1; i++) { |
| Char c = pattern_[i]; |
| int bucket = (sizeof(Char) == 1) ? c : c % AlphabetSize(); |
| bad_char_occurrence[bucket] = i; |
| } |
| } |
| |
| //--------------------------------------------------------------------- |
| // Linear string search with bailout to BMH. |
| //--------------------------------------------------------------------- |
| |
| // Simple linear search for short patterns, which bails out if the string |
| // isn't found very early in the subject. Upgrades to BoyerMooreHorspool. |
| template <typename Char> |
| size_t StringSearch<Char>::InitialSearch( |
| Vector subject, |
| size_t index) { |
| const size_t pattern_length = pattern_.length(); |
| // Badness is a count of how much work we have done. When we have |
| // done enough work we decide it's probably worth switching to a better |
| // algorithm. |
| int64_t badness = -10 - (pattern_length << 2); |
| |
| // We know our pattern is at least 2 characters, we cache the first so |
| // the common case of the first character not matching is faster. |
| for (size_t i = index, n = subject.length() - pattern_length; i <= n; i++) { |
| badness++; |
| if (badness <= 0) { |
| i = FindFirstCharacter(pattern_, subject, i); |
| if (i == subject.length()) |
| return subject.length(); |
| CHECK_LE(i, n); |
| size_t j = 1; |
| do { |
| if (pattern_[j] != subject[i + j]) { |
| break; |
| } |
| j++; |
| } while (j < pattern_length); |
| if (j == pattern_length) { |
| return i; |
| } |
| badness += j; |
| } else { |
| PopulateBoyerMooreHorspoolTable(); |
| strategy_ = &StringSearch::BoyerMooreHorspoolSearch; |
| return BoyerMooreHorspoolSearch(subject, i); |
| } |
| } |
| return subject.length(); |
| } |
| |
| // Perform a single stand-alone search. |
| // If searching multiple times for the same pattern, a search |
| // object should be constructed once and the Search function then called |
| // for each search. |
| template <typename Char> |
| size_t SearchString(Vector<const Char> subject, |
| Vector<const Char> pattern, |
| size_t start_index) { |
| StringSearch<Char> search(pattern); |
| return search.Search(subject, start_index); |
| } |
| } // namespace stringsearch |
| } // namespace node |
| |
| namespace node { |
| |
| template <typename Char> |
| size_t SearchString(const Char* haystack, |
| size_t haystack_length, |
| const Char* needle, |
| size_t needle_length, |
| size_t start_index, |
| bool is_forward) { |
| if (haystack_length < needle_length) return haystack_length; |
| // To do a reverse search (lastIndexOf instead of indexOf) without redundant |
| // code, create two vectors that are reversed views into the input strings. |
| // For example, v_needle[0] would return the *last* character of the needle. |
| // So we're searching for the first instance of rev(needle) in rev(haystack) |
| stringsearch::Vector<const Char> v_needle(needle, needle_length, is_forward); |
| stringsearch::Vector<const Char> v_haystack( |
| haystack, haystack_length, is_forward); |
| size_t diff = haystack_length - needle_length; |
| size_t relative_start_index; |
| if (is_forward) { |
| relative_start_index = start_index; |
| } else if (diff < start_index) { |
| relative_start_index = 0; |
| } else { |
| relative_start_index = diff - start_index; |
| } |
| size_t pos = node::stringsearch::SearchString( |
| v_haystack, v_needle, relative_start_index); |
| if (pos == haystack_length) { |
| // not found |
| return pos; |
| } |
| return is_forward ? pos : (haystack_length - needle_length - pos); |
| } |
| |
| template <size_t N> |
| size_t SearchString(const char* haystack, size_t haystack_length, |
| const char (&needle)[N]) { |
| return SearchString( |
| reinterpret_cast<const uint8_t*>(haystack), haystack_length, |
| reinterpret_cast<const uint8_t*>(needle), N - 1, 0, true); |
| } |
| |
| } // namespace node |
| |
| #endif // defined(NODE_WANT_INTERNALS) && NODE_WANT_INTERNALS |
| |
| #endif // SRC_STRING_SEARCH_H_ |