blob: 1a339ad5b17f669c68894d765c1ab03ef385610b [file] [log] [blame]
/*
* Copyright (C) 2005-2023 Apple Inc. All rights reserved.
* Copyright (C) 2011 Google Inc. All rights reserved.
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Library General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Library General Public License for more details.
*
* You should have received a copy of the GNU Library General Public License
* along with this library; see the file COPYING.LIB. If not, write to
* the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
* Boston, MA 02110-1301, USA.
*
*/
#pragma once
#include <unicode/ubrk.h>
#include <wtf/ASCIICType.h>
#include <wtf/StdLibExtras.h>
#include <wtf/text/TextBreakIterator.h>
#include <wtf/unicode/CharacterNames.h>
namespace WebCore {
class BreakLines {
public:
enum class NoBreakSpaceBehavior {
Normal,
Break,
};
enum class WordBreakBehavior {
Normal,
BreakAll,
KeepAll,
AutoPhrase,
};
enum class LineBreakRules {
Normal, // Fast path available when using default line-breaking rules within ASCII.
Special, // Uses ICU to handle special line-breaking rules.
};
template<LineBreakRules, WordBreakBehavior, NoBreakSpaceBehavior>
static inline unsigned nextBreakablePosition(CachedLineBreakIteratorFactory&, size_t startPosition);
static inline unsigned nextBreakablePosition(CachedLineBreakIteratorFactory& iterator, size_t startPosition)
{
return nextBreakablePosition<LineBreakRules::Normal, WordBreakBehavior::Normal, NoBreakSpaceBehavior::Normal>(iterator, startPosition);
}
static inline bool isBreakable(CachedLineBreakIteratorFactory&, unsigned startPosition, std::optional<unsigned>& nextBreakable, bool breakNBSP, bool canUseShortcut, bool keepAllWords, bool breakAnywhere);
private:
// Iterator implementations.
template<typename CharacterType, LineBreakRules, WordBreakBehavior, NoBreakSpaceBehavior>
static inline size_t nextBreakablePosition(CachedLineBreakIteratorFactory&, std::span<const CharacterType> string, size_t startPosition);
template<typename CharacterType, NoBreakSpaceBehavior>
static inline size_t nextBreakableSpace(std::span<const CharacterType> string, size_t startPosition);
static inline unsigned nextCharacter(CachedLineBreakIteratorFactory&, unsigned startPosition);
// Helper functions.
enum BreakClass : uint16_t;
template<LineBreakRules, NoBreakSpaceBehavior>
static inline BreakClass classify(char16_t character);
template<NoBreakSpaceBehavior>
static inline bool isBreakableSpace(char16_t character);
// Data types.
enum BreakClass : uint16_t {
// See UAX14 and LineBreak.txt
// https://www.unicode.org/Public/UCD/latest/ucd/LineBreak.txt
kIndeterminate = 0,
kAL = 1,
kID = 1 << 1,
kCM = 1 << 2,
kOP = 1 << 3,
kCP = 1 << 4,
kCL = 1 << 5,
kGL = 1 << 6,
kQU = 1 << 7,
kSP = 1 << 8,
kNU = kAL,
kWeird = 1 << 15,
// Currently we map
// 1. HL to AL
// 2. H2 and H3 to ID
// We also don't distinguish AL and NU.
// If we pull more logic into isBreakable, these may need to be distinguished.
};
template<typename CharacterType>
struct CharacterInfo {
CharacterType id { 0 };
BreakClass type { kIndeterminate };
CharacterInfo(CharacterType character = 0)
: id(character)
, type(kIndeterminate)
{ }
inline void set(CharacterType character)
{
id = character;
type = kIndeterminate;
}
operator CharacterType() const { return id; }
};
class LineBreakTable {
public:
static constexpr char16_t firstCharacter = '!';
static constexpr char16_t lastCharacter = 0xFF;
static constexpr unsigned rowCount = lastCharacter - firstCharacter + 1;
static constexpr unsigned columnCount = (lastCharacter - firstCharacter) / 8 + 1;
static inline bool unsafeLookup(char16_t before, char16_t after) // Must range check before calling.
{
const unsigned beforeIndex = before - firstCharacter;
const unsigned afterIndex = after - firstCharacter;
return breakTable[beforeIndex][afterIndex / 8] & (1 << (afterIndex % 8));
}
private:
WEBCORE_EXPORT static const std::array<std::array<uint8_t, columnCount>, rowCount> breakTable;
};
static const LineBreakTable lineBreakTable;
};
template<BreakLines::NoBreakSpaceBehavior nonBreakingSpaceBehavior>
inline bool BreakLines::isBreakableSpace(char16_t character)
{
switch (character) {
case ' ':
case '\n':
case '\t':
return true;
case noBreakSpace:
return nonBreakingSpaceBehavior == NoBreakSpaceBehavior::Break;
default:
return false;
}
}
template<typename CharacterType, BreakLines::LineBreakRules shortcutRules, BreakLines::WordBreakBehavior words, BreakLines::NoBreakSpaceBehavior nonBreakingSpaceBehavior>
inline size_t BreakLines::nextBreakablePosition(CachedLineBreakIteratorFactory& lineBreakIteratorFactory, std::span<const CharacterType> string, size_t startPosition)
{
// Don't break if positioned at start of primary context and there is no prior context.
auto priorContextLength = lineBreakIteratorFactory.priorContext().length();
if (!startPosition && !priorContextLength) {
if (string.size() <= 1)
return string.size();
startPosition++;
}
CharacterInfo<CharacterType> beforeBefore(startPosition > 1 ? string[startPosition - 2]
: static_cast<CharacterType>(lineBreakIteratorFactory.priorContext().secondToLastCharacter()));
CharacterInfo<CharacterType> before(startPosition > 0 ? string[startPosition - 1]
: static_cast<CharacterType>(lineBreakIteratorFactory.priorContext().lastCharacter()));
CharacterInfo<CharacterType> after;
std::optional<size_t> nextBreak;
for (size_t i = startPosition; i < string.size(); beforeBefore = before, before = after, ++i) {
after.set(string[i]);
// Breakable spaces.
if (isBreakableSpace<nonBreakingSpaceBehavior>(after))
return i;
// ASCII rapid lookup.
if constexpr (shortcutRules == LineBreakRules::Normal) { // Not valid for 'loose' line-breaking.
// Don't allow line breaking between '-' and a digit if the '-' may mean a minus sign in the context,
// while allow breaking in 'ABCD-1234' and '1234-5678' which may be in long URLs.
if (before == '-' && isASCIIDigit(after)) {
if (isASCIIAlphanumeric(beforeBefore))
return i;
continue;
}
// If both characters are ASCII, use a lookup table for enhanced speed
// and for compatibility with other browsers (see comments on lineBreakTable for details).
if (before <= lineBreakTable.lastCharacter && after <= lineBreakTable.lastCharacter) {
if (before >= lineBreakTable.firstCharacter && after >= lineBreakTable.firstCharacter) {
if (lineBreakTable.unsafeLookup(before, after))
return i;
} // Else at least one is an ASCII control character; don't break.
continue;
}
}
// Non-ASCII rapid lookup.
if constexpr (words != WordBreakBehavior::AutoPhrase) {
if (!before.type)
before.type = classify<shortcutRules, nonBreakingSpaceBehavior>(before);
after.type = classify<shortcutRules, nonBreakingSpaceBehavior>(after);
// Short-circuit the commonest cases: letter + letter.
unsigned pair = before.type | after.type;
// AL+AL SP+AL SP+QU AL+QU QU+QU QU+AL (after's SP is already filtered out).
if (!(pair & ~(kSP | kAL | kQU))) {
if constexpr (words == WordBreakBehavior::BreakAll) {
if (pair == kAL)
return i;
}
continue;
}
if ((pair | kAL) == (kID | kAL)) {
if constexpr (words == WordBreakBehavior::KeepAll)
continue;
return i;
}
// Handle special cases.
if (pair & (kGL | kQU) && !(pair & kWeird)) // Keep nbsp high in our list.
continue;
if (after.type == kCM) {
after.type = before.type;
continue;
}
if constexpr (shortcutRules == LineBreakRules::Normal) {
if (!(pair & kWeird)) {
// Handle some common and obvious punctuation behaviors.
if (pair & (kCL | kCP | kOP)) {
if (after.type == kCL || after.type == kCP || before.type == kOP)
continue;
if (pair & kID)
return i;
}
}
}
}
// ICU lookup (slow).
if (!nextBreak || nextBreak.value() < i) {
auto& breakIterator = lineBreakIteratorFactory.get();
nextBreak = breakIterator.following(i - 1);
}
// Fast forward while our behavior matches ICU.
if (nextBreak && i < nextBreak.value()) {
for (size_t max = std::min(nextBreak.value(), string.size() - 1); i < max; beforeBefore = before, before = after, ++i) {
CharacterType lookahead = string[i + 1];
if ((lookahead <= lineBreakTable.lastCharacter && !isASCIIAlpha(lookahead))
|| (nonBreakingSpaceBehavior == NoBreakSpaceBehavior::Break && lookahead == noBreakSpace))
break;
}
}
if (i == nextBreak && !isBreakableSpace<nonBreakingSpaceBehavior>(before))
return i;
}
return string.size();
}
template<typename CharacterType, BreakLines::NoBreakSpaceBehavior nonBreakingSpaceBehavior>
inline size_t BreakLines::nextBreakableSpace(std::span<const CharacterType> string, size_t startPosition)
{
// FIXME: Use ICU instead.
for (size_t i = startPosition; i < string.size(); ++i) {
if (isBreakableSpace<nonBreakingSpaceBehavior>(string[i]))
return i;
// FIXME: This should either be in isBreakableSpace (though previous attempts broke the world) or should use ICU instead.
if (string[i] == zeroWidthSpace)
return i;
if (string[i] == ideographicSpace)
return i + 1;
}
return string.size();
}
inline unsigned BreakLines::nextCharacter(CachedLineBreakIteratorFactory& lineBreakIteratorFactory, unsigned startPosition)
{
auto stringView = lineBreakIteratorFactory.stringView();
ASSERT(startPosition <= stringView.length());
// FIXME: Can/Should we implement this using a Shared Iterator (performance issue)
// https://bugs.webkit.org/show_bug.cgi?id=197876
NonSharedCharacterBreakIterator iterator(stringView);
std::optional<unsigned> next = ubrk_following(iterator, startPosition);
return next.value_or(stringView.length());
}
template<BreakLines::LineBreakRules rules, BreakLines::WordBreakBehavior words, BreakLines::NoBreakSpaceBehavior spaces>
inline unsigned BreakLines::nextBreakablePosition(CachedLineBreakIteratorFactory& lineBreakIteratorFactory, size_t startPosition)
{
auto stringView = lineBreakIteratorFactory.stringView();
if (stringView.is8Bit()) {
return words == WordBreakBehavior::KeepAll
? nextBreakableSpace<LChar, spaces>(stringView.span8(), startPosition)
: nextBreakablePosition<LChar, rules, words, spaces>(lineBreakIteratorFactory, stringView.span8(), startPosition);
}
return words == WordBreakBehavior::KeepAll
? nextBreakableSpace<char16_t, spaces>(stringView.span16(), startPosition)
: nextBreakablePosition<char16_t, rules, words, spaces>(lineBreakIteratorFactory, stringView.span16(), startPosition);
}
inline bool BreakLines::isBreakable(CachedLineBreakIteratorFactory& lineBreakIteratorFactory, unsigned startPosition, std::optional<unsigned>& nextBreakable, bool breakNBSP, bool canUseShortcut, bool keepAllWords, bool breakAnywhere)
{
if (nextBreakable && nextBreakable.value() >= startPosition)
return startPosition == nextBreakable;
if (breakAnywhere)
return startPosition == nextCharacter(lineBreakIteratorFactory, startPosition);
if (keepAllWords) {
if (breakNBSP)
return startPosition == nextBreakablePosition<LineBreakRules::Special, WordBreakBehavior::KeepAll, NoBreakSpaceBehavior::Break>(lineBreakIteratorFactory, startPosition);
return startPosition == nextBreakablePosition<LineBreakRules::Special, WordBreakBehavior::KeepAll, NoBreakSpaceBehavior::Normal>(lineBreakIteratorFactory, startPosition);
}
if (canUseShortcut) {
if (breakNBSP)
return startPosition == nextBreakablePosition<LineBreakRules::Normal, WordBreakBehavior::Normal, NoBreakSpaceBehavior::Break>(lineBreakIteratorFactory, startPosition);
return startPosition == nextBreakablePosition<LineBreakRules::Normal, WordBreakBehavior::Normal, NoBreakSpaceBehavior::Normal>(lineBreakIteratorFactory, startPosition);
}
if (breakNBSP)
return startPosition == nextBreakablePosition<LineBreakRules::Special, WordBreakBehavior::Normal, NoBreakSpaceBehavior::Break>(lineBreakIteratorFactory, startPosition);
return startPosition == nextBreakablePosition<LineBreakRules::Special, WordBreakBehavior::Normal, NoBreakSpaceBehavior::Normal>(lineBreakIteratorFactory, startPosition);
}
template<BreakLines::LineBreakRules rules, BreakLines::NoBreakSpaceBehavior nonBreakingSpaceBehavior>
inline BreakLines::BreakClass BreakLines::classify(char16_t character)
{
// This function is optimized for letters and NBSP.
// See LineBreak.txt for classification.
static constexpr char16_t blockLast3 = ~0x07;
static constexpr char16_t blockLast4 = ~0x0F;
static constexpr char16_t blockLast6 = ~0x3F;
static constexpr char16_t blockLast7 = ~0x7F;
static constexpr char16_t blockLast8 = ~0xFF;
switch ((character & blockLast7) / 0x80) {
// Compilers are not smart enough to make a jump table if the step is not 1.
case 0x0000 / 0x80: // ASCII
switch ((character & blockLast4) / 0x10) {
case 0x0000 / 0x10:
return kWeird;
case 0x0010 / 0x10:
return kCM;
case 0x0020 / 0x10:
if (character == 0x0020) // space
return kSP;
if (character == 0x0022 || character == 0x0027)
return kQU;
if (character == 0x0028)
return kOP;
if (character == 0x0029)
return kCP;
return kWeird;
case 0x0030 / 0x10:
if (character <= 0x0039) // Numbers.
return kNU;
return kWeird;
case 0x0040 / 0x10:
return kAL;
case 0x0050 / 0x10:
if (character <= 0x005A)
return kAL;
if (character == 0x005B)
return kOP;
if (character == 0x005D)
return kCP;
return kWeird;
case 0x0060 / 0x10:
return kAL;
case 0x0070 / 0x10:
if (character <= 0x007A)
return kAL;
if (character == 0x007B)
return kOP;
if (character == 0x007D)
return kCL;
return kWeird;
default:
return kWeird;
}
case 0x0080 / 0x80: // Latin-1
if (character == 0x00A0) // noBreakSpace
return nonBreakingSpaceBehavior == NoBreakSpaceBehavior::Normal ? kGL : kSP;
if (character > 0x00C0)
return kAL;
if (character == 0x00A1 || character == 0x00BF)
return kOP;
if (character == 0x00AB || character == 0x00BB)
return kQU;
return kWeird;
case 0x0100 / 0x80:
case 0x0180 / 0x80:
case 0x0200 / 0x80:
return kAL;
case 0x0280 / 0x80:
switch (character) {
case 0x02C8:
case 0x02CC:
case 0x02DF:
return kWeird;
default:
return kAL;
}
case 0x0300 / 0x80:
if (character == 0x034F || (0x035C <= character && character <= 0x0362))
return kGL;
if (character < 0x370)
return kCM;
if (character == 0x037E) [[unlikely]]
return kWeird;
return kAL;
case 0x0380 / 0x80:
case 0x0400 / 0x80:
return kAL;
case 0x0480 / 0x80:
if (0x0483 <= character && character <= 0x0489)
return kCM;
return kAL;
case 0x0500 / 0x80:
return kAL;
case 0x0580 / 0x80:
if (character <= 0x0588 || 0x05C8 <= character)
return kAL; // WARNING: Some of these are actually HL.
if (0x0591 <= character && character <= 0x05BD)
return kCM;
// 0x05BE to 0x05C7 is mixed up.
switch (character) {
case 0x05BF:
case 0x05C1:
case 0x05C2:
case 0x05C4:
case 0x05C5:
case 0x05C7:
return kCM;
default:
return kWeird;
}
case 0x0600 / 0x80:
case 0x0680 / 0x80:
case 0x0700 / 0x80:
case 0x0780 / 0x80:
case 0x0800 / 0x80:
case 0x0880 / 0x80:
case 0x0900 / 0x80:
case 0x0980 / 0x80:
case 0x1000 / 0x80:
case 0x1080 / 0x80:
case 0x1100 / 0x80:
case 0x1180 / 0x80:
case 0x1200 / 0x80:
case 0x1280 / 0x80:
case 0x1300 / 0x80:
case 0x1380 / 0x80:
case 0x1400 / 0x80:
case 0x1480 / 0x80:
case 0x1500 / 0x80:
case 0x1580 / 0x80:
case 0x1600 / 0x80:
case 0x1680 / 0x80:
case 0x1700 / 0x80:
case 0x1780 / 0x80:
case 0x1800 / 0x80:
case 0x1880 / 0x80:
case 0x1900 / 0x80:
case 0x1980 / 0x80:
return kWeird;
case 0x2000 / 0x80:
if (character == 0x2018 || character == 0x2019)
return kQU;
return kWeird;
// FIXME: Continue bitmask switch up to 2E80.
}
// CJK
if (0x2E80 <= character && character <= 0xA4CF) {
if ((character & blockLast8) == 0x3000) {
if (character <= 0x303F) {
// This block (0x3000-0x303F) is categorically very mixed up.
switch (character & 0x1F) {
case 0x3001 & 0x1F:
case 0x3002 & 0x1F:
case 0x3009 & 0x1F:
case 0x300B & 0x1F:
case 0x300D & 0x1F:
case 0x300F & 0x1F:
case 0x3011 & 0x1F:
case 0x3015 & 0x1F:
case 0x3017 & 0x1F:
case 0x3019 & 0x1F:
case 0x301B & 0x1F:
case 0x301E & 0x1F:
case 0x301F & 0x1F:
return kCL;
case 0x3008 & 0x1F:
case 0x300A & 0x1F:
case 0x300C & 0x1F:
case 0x300E & 0x1F:
case 0x3010 & 0x1F:
case 0x3016 & 0x1F:
case 0x3014 & 0x1F:
case 0x3018 & 0x1F:
case 0x301A & 0x1F:
case 0x301D & 0x1F:
return kOP;
default:
return kWeird;
}
}
// This block (0x3040-0x30FF) is tangled up and sensitive to 'line-break'.
return kWeird;
}
if ((character & blockLast4) == 0x31F0) [[unlikely]] // Small Kana.
return kWeird;
if ((character & blockLast3) == 0x3248) [[unlikely]]
return kAL;
if ((character & blockLast6) == 0x4DC0) [[unlikely]]
return kAL;
if (character == 0xA015) [[unlikely]]
return kWeird;
return kID;
}
// Precomposed Hangul
if (0xAC00 <= character && character <= 0xD7AF)
return kID; // WARNING: These are actually H2 or H3.
// More CJK
if (0xF900 <= character && character <= 0XFAFF)
return kID;
return kWeird;
}
} // namespace WebCore