// src/unicodetext.cc - external/github.com/google/cld_3 - Git at Google

 // Copyright (C) 2006 Google Inc.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 // http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 // Author: Jim Meehan

 #include "unicodetext.h"

 #include "base.h"
 #include "utils.h"

 namespace chrome_lang_id {

 // *************** Data representation **********
 // Note: the copy constructor is undefined.

 // Makes this Repr refer to the `size` bytes at `data` without copying them.
 // A buffer previously flagged as owned is released first; afterwards the Repr
 // is marked as not owning its storage (ours_ is cleared).
 void UnicodeText::Repr::PointTo(const char *data, int size) {
   // Only a buffer flagged as ours may be freed here.
   if (ours_) {
     if (data_) delete[] data_;
   }

   // Alias the caller's bytes; size and capacity both equal the given length.
   ours_ = false;
   data_ = const_cast<char *>(data);
   capacity_ = size;
   size_ = size;
 }

 // *************** UnicodeText ******************

 // Default constructor.
 UnicodeText::UnicodeText() {
   // Intentionally empty: no member is set up explicitly here.
 }

 // Points this UnicodeText at `byte_length` bytes starting at `buffer` by
 // forwarding to repr_.PointTo, then returns *this so calls can be chained.
 // NOTE(review): presumably `buffer` must hold valid UTF-8 and outlive this
 // object -- confirm against the header.
 UnicodeText &UnicodeText::PointToUTF8(const char *buffer, int byte_length) {
   this->repr_.PointTo(buffer, byte_length);
   return *this;
 }

 // Destructor.
 UnicodeText::~UnicodeText() {
   // Intentionally empty: nothing is released explicitly here.
 }

 // ******************* UnicodeText::const_iterator *********************

 // The implementation of const_iterator would be nicer if it
 // inherited from boost::iterator_facade
 // (http://boost.org/libs/iterator/doc/iterator_facade.html).

 // Default-constructs an iterator whose position pointer is null (0).
 UnicodeText::const_iterator::const_iterator()
     : it_(0) {
 }

 // Copies the position of `other` into this iterator and returns *this.
 // Self-assignment leaves the iterator untouched.
 UnicodeText::const_iterator &UnicodeText::const_iterator::operator=(
     const const_iterator &other) {
   if (this == &other) return *this;  // Nothing to copy onto itself.
   it_ = other.it_;
   return *this;
 }

 // Returns an iterator positioned at the start of the stored bytes (data_).
 UnicodeText::const_iterator UnicodeText::begin() const {
   const_iterator first(repr_.data_);
   return first;
 }

 // Returns an iterator positioned one past the last stored byte
 // (data_ + size_).
 UnicodeText::const_iterator UnicodeText::end() const {
   const_iterator past_the_end(repr_.data_ + repr_.size_);
   return past_the_end;
 }

 // Decodes and returns the code point that starts at the current position.
 // No validation is performed; only as many bytes as the lead byte calls for
 // are read.
 char32 UnicodeText::const_iterator::operator*() const {
   // Decoding is done by hand rather than via chartorune: that routine does
   // error-checking we do not need because our data is guaranteed to be valid
   // UTF-8, and this operator is expected to be called very often.

   const unsigned char lead = static_cast<unsigned char>(it_[0]);
   if (lead < 0x80) return lead;  // One-byte sequence.

   // Every further byte contributes its low six bits.
   const int tail1 = static_cast<unsigned char>(it_[1]) & 0x3F;
   if (lead < 0xE0) return ((lead & 0x1F) << 6) | tail1;  // Two bytes.

   const int tail2 = static_cast<unsigned char>(it_[2]) & 0x3F;
   if (lead < 0xF0) {  // Three bytes.
     return ((lead & 0x0F) << 12) | (tail1 << 6) | tail2;
   }

   // Four bytes.
   const int tail3 = static_cast<unsigned char>(it_[3]) & 0x3F;
   return ((lead & 0x07) << 18) | (tail1 << 12) | (tail2 << 6) | tail3;
 }

 // Pre-increment: advances the position by the amount utils::OneCharLen
 // returns for it (presumably the byte length of the current UTF-8
 // character -- confirm in utils.h), then returns *this.
 UnicodeText::const_iterator &UnicodeText::const_iterator::operator++() {
   it_ = it_ + chrome_lang_id::utils::OneCharLen(it_);
   return *this;
 }

 }  // namespace chrome_lang_id
	// Copyright (C) 2006 Google Inc.
	//
	// Licensed under the Apache License, Version 2.0 (the "License");
	// you may not use this file except in compliance with the License.
	// You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing, software
	// distributed under the License is distributed on an "AS IS" BASIS,
	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	// See the License for the specific language governing permissions and
	// limitations under the License.

	// Author: Jim Meehan

	#include "unicodetext.h"

	#include "base.h"
	#include "utils.h"

	namespace chrome_lang_id {

	// ************* Data representation ********
	// Note: the copy constructor is undefined.

	// Makes this Repr refer to the `size` bytes at `data` without copying them.
	// A buffer previously flagged as owned is released first; afterwards the
	// Repr is marked as not owning its storage (ours_ is cleared).
	void UnicodeText::Repr::PointTo(const char *data, int size) {
	  // Only a buffer flagged as ours may be freed here.
	  if (ours_) {
	    if (data_) delete[] data_;
	  }

	  // Alias the caller's bytes; size and capacity both equal the given length.
	  ours_ = false;
	  data_ = const_cast<char *>(data);
	  capacity_ = size;
	  size_ = size;
	}

	// ************* UnicodeText ****************

	// Default constructor.
	UnicodeText::UnicodeText() {
	  // Intentionally empty: no member is set up explicitly here.
	}

	// Points this UnicodeText at `byte_length` bytes starting at `buffer` by
	// forwarding to repr_.PointTo, then returns *this so calls can be chained.
	// NOTE(review): presumably `buffer` must hold valid UTF-8 and outlive this
	// object -- confirm against the header.
	UnicodeText &UnicodeText::PointToUTF8(const char *buffer, int byte_length) {
	  this->repr_.PointTo(buffer, byte_length);
	  return *this;
	}

	// Destructor.
	UnicodeText::~UnicodeText() {
	  // Intentionally empty: nothing is released explicitly here.
	}

	// ***************** UnicodeText::const_iterator *******************

	// The implementation of const_iterator would be nicer if it
	// inherited from boost::iterator_facade
	// (http://boost.org/libs/iterator/doc/iterator_facade.html).

	// Default-constructs an iterator whose position pointer is null (0).
	UnicodeText::const_iterator::const_iterator()
	    : it_(0) {
	}

	// Copies the position of `other` into this iterator and returns *this.
	// Self-assignment leaves the iterator untouched.
	UnicodeText::const_iterator &UnicodeText::const_iterator::operator=(
	    const const_iterator &other) {
	  if (this == &other) return *this;  // Nothing to copy onto itself.
	  it_ = other.it_;
	  return *this;
	}

	// Returns an iterator positioned at the start of the stored bytes (data_).
	UnicodeText::const_iterator UnicodeText::begin() const {
	  const_iterator first(repr_.data_);
	  return first;
	}

	// Returns an iterator positioned one past the last stored byte
	// (data_ + size_).
	UnicodeText::const_iterator UnicodeText::end() const {
	  const_iterator past_the_end(repr_.data_ + repr_.size_);
	  return past_the_end;
	}

	// Decodes and returns the code point that starts at the current position.
	// No validation is performed; only as many bytes as the lead byte calls for
	// are read.
	//
	// Fix: the bitwise-OR operators had been written as "\|" (an escaping
	// artifact), which is not valid C++; they are restored to plain "|".
	char32 UnicodeText::const_iterator::operator*() const {
	  // (We could call chartorune here, but that does some
	  // error-checking, and we're guaranteed that our data is valid
	  // UTF-8. Also, we expect this routine to be called very often. So
	  // for speed, we do the calculation ourselves.)

	  // Convert from UTF-8. A lead byte below 0x80 is a complete character.
	  unsigned char byte1 = static_cast<unsigned char>(it_[0]);
	  if (byte1 < 0x80) return byte1;

	  // Two-byte sequence: 5 payload bits from the lead, 6 from the next byte.
	  unsigned char byte2 = static_cast<unsigned char>(it_[1]);
	  if (byte1 < 0xE0) return ((byte1 & 0x1F) << 6) | (byte2 & 0x3F);

	  // Three-byte sequence: 4 payload bits from the lead, 6 from each follower.
	  unsigned char byte3 = static_cast<unsigned char>(it_[2]);
	  if (byte1 < 0xF0) {
	    return ((byte1 & 0x0F) << 12) | ((byte2 & 0x3F) << 6) | (byte3 & 0x3F);
	  }

	  // Four-byte sequence: 3 payload bits from the lead, 6 from each follower.
	  unsigned char byte4 = static_cast<unsigned char>(it_[3]);
	  return ((byte1 & 0x07) << 18) | ((byte2 & 0x3F) << 12) |
	         ((byte3 & 0x3F) << 6) | (byte4 & 0x3F);
	}

	// Pre-increment: advances the position by the amount utils::OneCharLen
	// returns for it (presumably the byte length of the current UTF-8
	// character -- confirm in utils.h), then returns *this.
	UnicodeText::const_iterator &UnicodeText::const_iterator::operator++() {
	  it_ = it_ + chrome_lang_id::utils::OneCharLen(it_);
	  return *this;
	}

	} // namespace chrome_lang_id