blob: ec64ffa8725303f009ce16a07e394eb625115d54 [file] [edit]
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "text_processing.h"
#include <stdio.h>
#include <string.h>
namespace chrome_lang_id {
namespace CLD2 {
namespace {
static const int kMaxSpaceScan = 32; // Bytes
int minint(int a, int b) { return (a < b) ? a : b; }
// Counts number of spaces; a little faster than one-at-a-time
// Doesn't count odd bytes at end
int CountSpaces4(const char *src, int src_len) {
int s_count = 0;
for (int i = 0; i < (src_len & ~3); i += 4) {
s_count += (src[i] == ' ');
s_count += (src[i + 1] == ' ');
s_count += (src[i + 2] == ' ');
s_count += (src[i + 3] == ' ');
}
return s_count;
}
// This uses a cheap predictor to get a measure of compression, and
// hence a measure of repetitiveness. It works on complete UTF-8 characters
// instead of bytes, because three-byte UTF-8 Indic, etc. text compress highly
// all the time when done with a byte-based count. Sigh.
//
// To allow running prediction across multiple chunks, caller passes in current
// 12-bit hash value and int[4096] prediction table. Caller inits these to 0.
//
// Returns the number of *bytes* correctly predicted, increments by 1..4 for
// each correctly-predicted character.
//
// NOTE: Overruns by up to three bytes. Not a problem with valid UTF-8 text
//
// TODO(dsites) make this use just one byte per UTF-8 char and incr by charlen
int CountPredictedBytes(const char *isrc, int src_len, int *hash, int *tbl) {
typedef unsigned char uint8;
int p_count = 0;
const uint8 *src = reinterpret_cast<const uint8 *>(isrc);
const uint8 *srclimit = src + src_len;
int local_hash = *hash;
while (src < srclimit) {
int c = src[0];
int incr = 1;
// Pick up one char and length
if (c < 0xc0) {
// One-byte or continuation byte: 00xxxxxx 01xxxxxx 10xxxxxx
// Do nothing more
} else if ((c & 0xe0) == 0xc0) {
// Two-byte
c = (c << 8) | src[1];
incr = 2;
} else if ((c & 0xf0) == 0xe0) {
// Three-byte
c = (c << 16) | (src[1] << 8) | src[2];
incr = 3;
} else {
// Four-byte
c = (c << 24) | (src[1] << 16) | (src[2] << 8) | src[3];
incr = 4;
}
src += incr;
int p = tbl[local_hash]; // Prediction
tbl[local_hash] = c; // Update prediction
if (c == p) {
p_count += incr; // Count bytes of good predictions
}
local_hash = ((local_hash << 4) ^ c) & 0xfff;
}
*hash = local_hash;
return p_count;
}
// Backscan to word boundary, returning how many bytes n to go back
// so that src - n is non-space ans src - n - 1 is space.
// If not found in kMaxSpaceScan bytes, return 0..3 to a clean UTF-8 boundary
int BackscanToSpace(const char *src, int limit) {
int n = 0;
limit = minint(limit, kMaxSpaceScan);
while (n < limit) {
if (src[-n - 1] == ' ') {
return n;
} // We are at _X
++n;
}
n = 0;
while (n < limit) {
if ((src[-n] & 0xc0) != 0x80) {
return n;
} // We are at char begin
++n;
}
return 0;
}
// Forwardscan to word boundary, returning how many bytes n to go forward
// so that src + n is non-space ans src + n - 1 is space.
// If not found in kMaxSpaceScan bytes, return 0..3 to a clean UTF-8 boundary
int ForwardscanToSpace(const char *src, int limit) {
int n = 0;
limit = minint(limit, kMaxSpaceScan);
while (n < limit) {
if (src[n] == ' ') {
return n + 1;
} // We are at _X
++n;
}
n = 0;
while (n < limit) {
if ((src[n] & 0xc0) != 0x80) {
return n;
} // We are at char begin
++n;
}
return 0;
}
} // namespace
// Must be exactly 4096 for cheap compressor.
static const int kPredictionTableSize = 4096;
static const int kChunksizeDefault = 48; // Squeeze 48-byte chunks
static const int kSpacesThreshPercent = 30; // Squeeze if >=30% spaces
static const int kPredictThreshPercent = 40; // Squeeze if >=40% predicted
// Remove portions of text that have a high density of spaces, or that are
// overly repetitive, squeezing the remaining text in-place to the front of the
// input buffer.
//
// Squeezing looks at density of space/prediced chars in fixed-size chunks,
// specified by chunksize. A chunksize <= 0 uses the default size of 48 bytes.
//
// Return the new, possibly-shorter length
//
// Result Buffer ALWAYS has leading space and trailing space space space NUL,
// if input does
//
int CheapSqueezeInplace(char *isrc, int src_len, int ichunksize) {
char *src = isrc;
char *dst = src;
char *srclimit = src + src_len;
bool skipping = false;
int hash = 0;
// Allocate local prediction table.
int *predict_tbl = new int[kPredictionTableSize];
memset(predict_tbl, 0, kPredictionTableSize * sizeof(predict_tbl[0]));
int chunksize = ichunksize;
if (chunksize == 0) {
chunksize = kChunksizeDefault;
}
int space_thresh = (chunksize * kSpacesThreshPercent) / 100;
int predict_thresh = (chunksize * kPredictThreshPercent) / 100;
while (src < srclimit) {
int remaining_bytes = srclimit - src;
int len = minint(chunksize, remaining_bytes);
// Make len land us on a UTF-8 character boundary.
// Ah. Also fixes mispredict because we could get out of phase
// Loop always terminates at trailing space in buffer
while ((src[len] & 0xc0) == 0x80) {
++len;
} // Move past continuation bytes
int space_n = CountSpaces4(src, len);
int predb_n = CountPredictedBytes(src, len, &hash, predict_tbl);
if ((space_n >= space_thresh) || (predb_n >= predict_thresh)) {
// Skip the text
if (!skipping) {
// Keeping-to-skipping transition; do it at a space
int n = BackscanToSpace(dst, static_cast<int>(dst - isrc));
dst -= n;
if (dst == isrc) {
// Force a leading space if the first chunk is deleted
*dst++ = ' ';
}
skipping = true;
}
} else {
// Keep the text
if (skipping) {
// Skipping-to-keeping transition; do it at a space
int n = ForwardscanToSpace(src, len);
src += n;
remaining_bytes -= n; // Shrink remaining length
len -= n;
skipping = false;
}
// "len" can be negative in some cases
if (len > 0) {
memmove(dst, src, len);
dst += len;
}
}
src += len;
}
if ((dst - isrc) < (src_len - 3)) {
// Pad and make last char clean UTF-8 by putting following spaces
dst[0] = ' ';
dst[1] = ' ';
dst[2] = ' ';
dst[3] = '\0';
} else if ((dst - isrc) < src_len) {
// Make last char clean UTF-8 by putting following space off the end
dst[0] = ' ';
}
// Deallocate local prediction table
delete[] predict_tbl;
return static_cast<int>(dst - isrc);
}
} // namespace CLD2
} // namespace chrome_lang_id