// cpp/src/address_formatter.cc - external/libaddressinput - Git at Google

 // Copyright (C) 2014 Google Inc.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 // http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 #include <libaddressinput/address_formatter.h>

 #include <libaddressinput/address_data.h>
 #include <libaddressinput/address_field.h>

 #include <algorithm>
 #include <cassert>
 #include <cstddef>
 #include <functional>
 #include <string>
 #include <vector>

 #include "format_element.h"
 #include "language.h"
 #include "region_data_constants.h"
 #include "rule.h"
 #include "util/cctype_tolower_equal.h"
 #include "util/size.h"

 namespace i18n {
 namespace addressinput {

 namespace {

 // Separators placed between address lines when several lines are combined into
 // one. GetLineSeparatorForLanguage() returns one of these (or the empty string)
 // depending on the language tag.
 const char kCommaSeparator[] = ", ";
 const char kSpaceSeparator[] = " ";
 const char kArabicCommaSeparator[] = "، ";

 // Base languages whose lines are joined with kSpaceSeparator. Every entry in
 // these tables is a two-letter code plus its terminating NUL, hence the inner
 // array size of 3.
 const char kLanguagesThatUseSpace[][3] = {
     "th",
     "ko",
 };

 // Base languages whose lines are joined with no separator at all.
 const char kLanguagesThatHaveNoSeparator[][3] = {
     "ja",
     "zh",  // All Chinese variants.
 };

 // Base languages whose lines are joined with kArabicCommaSeparator.
 // This data is based on CLDR, cross-checked with data provided by Chrome
 // linguists, for languages that are in official use in some country, where
 // Arabic is the most likely script tag.
 // TODO: Consider supporting variants such as tr-Arab by detecting the script
 // code.
 const char kLanguagesThatUseAnArabicComma[][3] = {
     "ar",
     "fa",
     "ku",
     "ps",
     "ur",
 };

 std::string GetLineSeparatorForLanguage(const std::string& language_tag) {
   Language address_language(language_tag);

   // First deal with explicit script tags.
   if (address_language.has_latin_script) {
     return kCommaSeparator;
   }

   // Now guess something appropriate based on the base language.
   const std::string& base_language = address_language.base;
   using std::placeholders::_1;
   if (std::find_if(kLanguagesThatUseSpace,
                    kLanguagesThatUseSpace + size(kLanguagesThatUseSpace),
                    std::bind(&EqualToTolowerString, _1, base_language)) !=
       kLanguagesThatUseSpace + size(kLanguagesThatUseSpace)) {
     return kSpaceSeparator;
   } else if (std::find_if(
                  kLanguagesThatHaveNoSeparator,
                  kLanguagesThatHaveNoSeparator +
                      size(kLanguagesThatHaveNoSeparator),
                  std::bind(&EqualToTolowerString, _1, base_language)) !=
              kLanguagesThatHaveNoSeparator +
                  size(kLanguagesThatHaveNoSeparator)) {
     return "";
   } else if (std::find_if(
                  kLanguagesThatUseAnArabicComma,
                  kLanguagesThatUseAnArabicComma +
                      size(kLanguagesThatUseAnArabicComma),
                  std::bind(&EqualToTolowerString, _1, base_language)) !=
              kLanguagesThatUseAnArabicComma +
                  size(kLanguagesThatUseAnArabicComma)) {
     return kArabicCommaSeparator;
   }
   // Either the language is a Latin-script language, or no language was
   // specified. In the latter case we still return ", " as the most common
   // separator in use. In countries that don't use this, e.g. Thailand,
   // addresses are often written in Latin script where this would still be
   // appropriate, so this is a reasonable default in the absence of information.
   return kCommaSeparator;
 }

 // Joins |lines| into |*line|, inserting the separator chosen by
 // GetLineSeparatorForLanguage(|language_tag|) between consecutive entries.
 // |*line| is cleared first; |line| is dereferenced unconditionally and so must
 // not be null.
 void CombineLinesForLanguage(const std::vector<std::string>& lines,
                              const std::string& language_tag,
                              std::string* line) {
   line->clear();
   const std::string joiner = GetLineSeparatorForLanguage(language_tag);

   bool is_first = true;
   for (const std::string& part : lines) {
     // The separator goes between entries, never before the first one.
     if (!is_first) {
       line->append(joiner);
     }
     line->append(part);
     is_first = false;
   }
 }

 }  // namespace

 void GetFormattedNationalAddress(
     const AddressData& address_data, std::vector<std::string>* lines) {
   assert(lines != nullptr);
   lines->clear();

   Rule rule;
   rule.CopyFrom(Rule::GetDefault());
   // TODO: Eventually, we should get the best rule for this country and
   // language, rather than just for the country.
   rule.ParseSerializedRule(
       RegionDataConstants::GetRegionData(address_data.region_code));

   Language language(address_data.language_code);

   // If Latin-script rules are available and the |language_code| of this address
   // is explicitly tagged as being Latin, then use the Latin-script formatting
   // rules.
   const std::vector<FormatElement>& format =
       language.has_latin_script && !rule.GetLatinFormat().empty()
           ? rule.GetLatinFormat()
           : rule.GetFormat();

   // Address format without the unnecessary elements (based on which address
   // fields are empty). We assume all literal strings that are not at the start
   // or end of a line are separators, and therefore only relevant if the
   // surrounding fields are filled in. This works with the data we have
   // currently.
   std::vector<FormatElement> pruned_format;
   for (auto element_it = format.begin();
        element_it != format.end();
        ++element_it) {
     // Always keep the newlines.
     if (element_it->IsNewline() ||
         // Always keep the non-empty address fields.
         (element_it->IsField() &&
          !address_data.IsFieldEmpty(element_it->GetField())) ||
         // Only keep literals that satisfy these 2 conditions:
         (!element_it->IsField() &&
          // (1) Not preceding an empty field.
          (element_it + 1 == format.end() ||
           !(element_it + 1)->IsField() ||
           !address_data.IsFieldEmpty((element_it + 1)->GetField())) &&
          // (2) Not following a removed field.
          (element_it == format.begin() ||
           !(element_it - 1)->IsField() ||
           (!pruned_format.empty() && pruned_format.back().IsField())))) {
       pruned_format.push_back(*element_it);
     }
   }

   std::string line;
   for (const auto& element : pruned_format) {
     if (element.IsNewline()) {
       if (!line.empty()) {
         lines->push_back(line);
         line.clear();
       }
     } else if (element.IsField()) {
       AddressField field = element.GetField();
       if (field == STREET_ADDRESS) {
         // The field "street address" represents the street address lines of an
         // address, so there can be multiple values.
         if (!address_data.IsFieldEmpty(field)) {
           line.append(address_data.address_line.front());
           if (address_data.address_line.size() > 1U) {
             lines->push_back(line);
             line.clear();
             const auto last_element_iterator =
                 address_data.address_line.begin() +
                 address_data.address_line.size() - 1;
             lines->insert(lines->end(), address_data.address_line.begin() + 1,
                           last_element_iterator);
             line.append(*last_element_iterator);
           }
         }
       } else {
         line.append(address_data.GetFieldValue(field));
       }
     } else {
       line.append(element.GetLiteral());
     }
   }
   if (!line.empty()) {
     lines->push_back(line);
   }
 }

 // Formats |address_data| with GetFormattedNationalAddress() and flattens the
 // resulting lines into |*line|, joined by the separator selected for
 // |address_data.language_code|.
 void GetFormattedNationalAddressLine(
     const AddressData& address_data, std::string* line) {
   std::vector<std::string> formatted_lines;
   GetFormattedNationalAddress(address_data, &formatted_lines);

   const auto& language_tag = address_data.language_code;
   CombineLinesForLanguage(formatted_lines, language_tag, line);
 }

 // Joins the street address lines of |address_data| into |*line|, using the
 // separator selected for |address_data.language_code|.
 void GetStreetAddressLinesAsSingleLine(
     const AddressData& address_data, std::string* line) {
   const auto& street_lines = address_data.address_line;
   const auto& language_tag = address_data.language_code;
   CombineLinesForLanguage(street_lines, language_tag, line);
 }

 }  // namespace addressinput
 }  // namespace i18n
	// Copyright (C) 2014 Google Inc.
	//
	// Licensed under the Apache License, Version 2.0 (the "License");
	// you may not use this file except in compliance with the License.
	// You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing, software
	// distributed under the License is distributed on an "AS IS" BASIS,
	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	// See the License for the specific language governing permissions and
	// limitations under the License.

	#include <libaddressinput/address_formatter.h>

	#include <libaddressinput/address_data.h>
	#include <libaddressinput/address_field.h>

	#include <algorithm>
	#include <cassert>
	#include <cstddef>
	#include <functional>
	#include <string>
	#include <vector>

	#include "format_element.h"
	#include "language.h"
	#include "region_data_constants.h"
	#include "rule.h"
	#include "util/cctype_tolower_equal.h"
	#include "util/size.h"

	namespace i18n {
	namespace addressinput {

	namespace {

	// Separators placed between address lines when several lines are combined into
	// one. GetLineSeparatorForLanguage() returns one of these (or the empty string)
	// depending on the language tag.
	const char kCommaSeparator[] = ", ";
	const char kSpaceSeparator[] = " ";
	const char kArabicCommaSeparator[] = "، ";

	// Base languages whose lines are joined with kSpaceSeparator. Every entry in
	// these tables is a two-letter code plus its terminating NUL, hence the inner
	// array size of 3.
	const char kLanguagesThatUseSpace[][3] = {
	"th",
	"ko",
	};

	// Base languages whose lines are joined with no separator at all.
	const char kLanguagesThatHaveNoSeparator[][3] = {
	"ja",
	"zh", // All Chinese variants.
	};

	// Base languages whose lines are joined with kArabicCommaSeparator.
	// This data is based on CLDR, cross-checked with data provided by Chrome
	// linguists, for languages that are in official use in some country, where
	// Arabic is the most likely script tag.
	// TODO: Consider supporting variants such as tr-Arab by detecting the script
	// code.
	const char kLanguagesThatUseAnArabicComma[][3] = {
	"ar",
	"fa",
	"ku",
	"ps",
	"ur",
	};

	std::string GetLineSeparatorForLanguage(const std::string& language_tag) {
	Language address_language(language_tag);

	// First deal with explicit script tags.
	if (address_language.has_latin_script) {
	return kCommaSeparator;
	}

	// Now guess something appropriate based on the base language.
	const std::string& base_language = address_language.base;
	using std::placeholders::_1;
	if (std::find_if(kLanguagesThatUseSpace,
	kLanguagesThatUseSpace + size(kLanguagesThatUseSpace),
	std::bind(&EqualToTolowerString, _1, base_language)) !=
	kLanguagesThatUseSpace + size(kLanguagesThatUseSpace)) {
	return kSpaceSeparator;
	} else if (std::find_if(
	kLanguagesThatHaveNoSeparator,
	kLanguagesThatHaveNoSeparator +
	size(kLanguagesThatHaveNoSeparator),
	std::bind(&EqualToTolowerString, _1, base_language)) !=
	kLanguagesThatHaveNoSeparator +
	size(kLanguagesThatHaveNoSeparator)) {
	return "";
	} else if (std::find_if(
	kLanguagesThatUseAnArabicComma,
	kLanguagesThatUseAnArabicComma +
	size(kLanguagesThatUseAnArabicComma),
	std::bind(&EqualToTolowerString, _1, base_language)) !=
	kLanguagesThatUseAnArabicComma +
	size(kLanguagesThatUseAnArabicComma)) {
	return kArabicCommaSeparator;
	}
	// Either the language is a Latin-script language, or no language was
	// specified. In the latter case we still return ", " as the most common
	// separator in use. In countries that don't use this, e.g. Thailand,
	// addresses are often written in Latin script where this would still be
	// appropriate, so this is a reasonable default in the absence of information.
	return kCommaSeparator;
	}

	// Joins |lines| into |*line|, inserting the separator chosen by
	// GetLineSeparatorForLanguage(|language_tag|) between consecutive entries.
	// |*line| is cleared first; |line| is dereferenced unconditionally and so must
	// not be null.
	void CombineLinesForLanguage(const std::vector<std::string>& lines,
	                             const std::string& language_tag,
	                             std::string* line) {
	  line->clear();
	  const std::string joiner = GetLineSeparatorForLanguage(language_tag);

	  bool is_first = true;
	  for (const std::string& part : lines) {
	    // The separator goes between entries, never before the first one.
	    if (!is_first) {
	      line->append(joiner);
	    }
	    line->append(part);
	    is_first = false;
	  }
	}

	} // namespace

	void GetFormattedNationalAddress(
	const AddressData& address_data, std::vector<std::string>* lines) {
	assert(lines != nullptr);
	lines->clear();

	Rule rule;
	rule.CopyFrom(Rule::GetDefault());
	// TODO: Eventually, we should get the best rule for this country and
	// language, rather than just for the country.
	rule.ParseSerializedRule(
	RegionDataConstants::GetRegionData(address_data.region_code));

	Language language(address_data.language_code);

	// If Latin-script rules are available and the \|language_code\| of this address
	// is explicitly tagged as being Latin, then use the Latin-script formatting
	// rules.
	const std::vector<FormatElement>& format =
	language.has_latin_script && !rule.GetLatinFormat().empty()
	? rule.GetLatinFormat()
	: rule.GetFormat();

	// Address format without the unnecessary elements (based on which address
	// fields are empty). We assume all literal strings that are not at the start
	// or end of a line are separators, and therefore only relevant if the
	// surrounding fields are filled in. This works with the data we have
	// currently.
	std::vector<FormatElement> pruned_format;
	for (auto element_it = format.begin();
	element_it != format.end();
	++element_it) {
	// Always keep the newlines.
	if (element_it->IsNewline() \|\|
	// Always keep the non-empty address fields.
	(element_it->IsField() &&
	!address_data.IsFieldEmpty(element_it->GetField())) \|\|
	// Only keep literals that satisfy these 2 conditions:
	(!element_it->IsField() &&
	// (1) Not preceding an empty field.
	(element_it + 1 == format.end() \|\|
	!(element_it + 1)->IsField() \|\|
	!address_data.IsFieldEmpty((element_it + 1)->GetField())) &&
	// (2) Not following a removed field.
	(element_it == format.begin() \|\|
	!(element_it - 1)->IsField() \|\|
	(!pruned_format.empty() && pruned_format.back().IsField())))) {
	pruned_format.push_back(*element_it);
	}
	}

	std::string line;
	for (const auto& element : pruned_format) {
	if (element.IsNewline()) {
	if (!line.empty()) {
	lines->push_back(line);
	line.clear();
	}
	} else if (element.IsField()) {
	AddressField field = element.GetField();
	if (field == STREET_ADDRESS) {
	// The field "street address" represents the street address lines of an
	// address, so there can be multiple values.
	if (!address_data.IsFieldEmpty(field)) {
	line.append(address_data.address_line.front());
	if (address_data.address_line.size() > 1U) {
	lines->push_back(line);
	line.clear();
	const auto last_element_iterator =
	address_data.address_line.begin() +
	address_data.address_line.size() - 1;
	lines->insert(lines->end(), address_data.address_line.begin() + 1,
	last_element_iterator);
	line.append(*last_element_iterator);
	}
	}
	} else {
	line.append(address_data.GetFieldValue(field));
	}
	} else {
	line.append(element.GetLiteral());
	}
	}
	if (!line.empty()) {
	lines->push_back(line);
	}
	}

	// Formats |address_data| with GetFormattedNationalAddress() and flattens the
	// resulting lines into |*line|, joined by the separator selected for
	// |address_data.language_code|.
	void GetFormattedNationalAddressLine(
	    const AddressData& address_data, std::string* line) {
	  std::vector<std::string> formatted_lines;
	  GetFormattedNationalAddress(address_data, &formatted_lines);

	  const auto& language_tag = address_data.language_code;
	  CombineLinesForLanguage(formatted_lines, language_tag, line);
	}

	// Joins the street address lines of |address_data| into |*line|, using the
	// separator selected for |address_data.language_code|.
	void GetStreetAddressLinesAsSingleLine(
	    const AddressData& address_data, std::string* line) {
	  const auto& street_lines = address_data.address_line;
	  const auto& language_tag = address_data.language_code;
	  CombineLinesForLanguage(street_lines, language_tag, line);
	}

	} // namespace addressinput
	} // namespace i18n