blob: 50bea8f351b4996d449b0857880e4841c980f7c7 [file] [edit]
/* Copyright 2016 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "script_detector.h"
#include <iostream>
#include "utils.h"
namespace chrome_lang_id {
namespace script_detector_test {
// Test-only convenience wrapper: determines the script of the character that
// starts at |p|.  The character's length in bytes is taken from
// utils::OneCharLen() and handed, together with |p|, to the two-argument
// chrome_lang_id::GetScript().
Script GetScript(const char *p) {
  return chrome_lang_id::GetScript(p, static_cast<int>(utils::OneCharLen(p)));
}
// Reports a test outcome on std::cout (" Success" when |status| is true,
// " Failure" otherwise, followed by std::endl) and hands |status| back to the
// caller so that a test can print and return its result in one expression.
bool PrintAndReturnStatus(bool status) {
  const char *const verdict = status ? " Success" : " Failure";
  std::cout << verdict << std::endl;
  return status;
}
// Verifies that Greek characters are classified as kScriptGreek and that
// plain ASCII characters are not.  Prints the test name and outcome; returns
// true iff every check passed.
bool TestGreekScript() {
  std::cout << "Running " << __FUNCTION__ << std::endl;

  // Positive samples.  The first two are the first / last character from the
  // Greek and Coptic script; the others are ordinary Greek letters.
  const char *const greek_samples[] = {"Ͱ", "Ͽ", "δ", "Θ", "Δ"};

  // Negative samples: an ASCII letter and an ASCII digit.
  const char *const non_greek_samples[] = {"a", "0"};

  bool all_checks_pass = true;
  for (const char *sample : greek_samples) {
    if (GetScript(sample) != kScriptGreek) {
      all_checks_pass = false;
      break;
    }
  }
  for (const char *sample : non_greek_samples) {
    // Stop at the first failed check: nothing after it is evaluated.
    if (!all_checks_pass) break;
    all_checks_pass = (GetScript(sample) != kScriptGreek);
  }
  return PrintAndReturnStatus(all_checks_pass);
}
// Verifies that a handful of Cyrillic characters are classified as
// kScriptCyrillic.  Prints the test name and outcome; returns true iff every
// sample was classified as expected.
bool TestCyrillicScript() {
  std::cout << "Running " << __FUNCTION__ << std::endl;

  // Samples are checked in order; the first mismatch ends the test.
  const char *const cyrillic_samples[] = {"Ѐ", "ӿ", "ш", "Б", "Ӱ"};

  bool all_cyrillic = true;
  for (const char *sample : cyrillic_samples) {
    if (GetScript(sample) != kScriptCyrillic) {
      all_cyrillic = false;
      break;
    }
  }
  return PrintAndReturnStatus(all_cyrillic);
}
// Verifies that a handful of Hebrew characters are classified as
// kScriptHebrew.  Prints the test name and outcome; returns true iff every
// sample was classified as expected.
bool TestHebrewScript() {
  std::cout << "Running " << __FUNCTION__ << std::endl;

  // Samples are checked in order; the first mismatch ends the test.
  const char *const hebrew_samples[] = {
      "֑",
      "״",
      "ד",
      "ה",
      "צ",
  };

  bool all_hebrew = true;
  for (const char *sample : hebrew_samples) {
    if (GetScript(sample) != kScriptHebrew) {
      all_hebrew = false;
      break;
    }
  }
  return PrintAndReturnStatus(all_hebrew);
}
// Verifies that two Arabic letters are classified as kScriptArabic.  Prints
// the test name and outcome; returns true iff both were classified as
// expected.
bool TestArabicScript() {
  std::cout << "Running " << __FUNCTION__ << std::endl;

  // Samples are checked in order; the first mismatch ends the test.
  const char *const arabic_samples[] = {
      "م",
      "خ",
  };

  bool all_arabic = true;
  for (const char *sample : arabic_samples) {
    if (GetScript(sample) != kScriptArabic) {
      all_arabic = false;
      break;
    }
  }
  return PrintAndReturnStatus(all_arabic);
}
// Verifies that a handful of Hangul Jamo characters are classified as
// kScriptHangulJamo.  Prints the test name and outcome; returns true iff every
// sample was classified as expected.
bool TestHangulJamoScript() {
  std::cout << "Running " << __FUNCTION__ << std::endl;

  // Samples are checked in order; the first mismatch ends the test.
  const char *const jamo_samples[] = {"ᄀ", "ᇿ", "ᄡ", "ᆅ", "ᅘ"};

  bool all_jamo = true;
  for (const char *sample : jamo_samples) {
    if (GetScript(sample) != kScriptHangulJamo) {
      all_jamo = false;
      break;
    }
  }
  return PrintAndReturnStatus(all_jamo);
}
// Verifies that a handful of Hiragana characters are classified as
// kScriptHiragana.  Prints the test name and outcome; returns true iff every
// sample was classified as expected.
bool TestHiraganaScript() {
  std::cout << "Running " << __FUNCTION__ << std::endl;

  // Samples are checked in order; the first mismatch ends the test.
  const char *const hiragana_samples[] = {"ぁ", "ゟ", "こ", "や", "ぜ"};

  bool all_hiragana = true;
  for (const char *sample : hiragana_samples) {
    if (GetScript(sample) != kScriptHiragana) {
      all_hiragana = false;
      break;
    }
  }
  return PrintAndReturnStatus(all_hiragana);
}
// Verifies that a handful of Katakana characters are classified as
// kScriptKatakana.  Prints the test name and outcome; returns true iff every
// sample was classified as expected.
bool TestKatakanaScript() {
  std::cout << "Running " << __FUNCTION__ << std::endl;

  // Samples are checked in order; the first mismatch ends the test.
  const char *const katakana_samples[] = {"゠", "ヿ", "ヂ", "ザ", "ヸ"};

  bool all_katakana = true;
  for (const char *sample : katakana_samples) {
    if (GetScript(sample) != kScriptKatakana) {
      all_katakana = false;
      break;
    }
  }
  return PrintAndReturnStatus(all_katakana);
}
// Verifies that characters from scripts without a dedicated Script value fall
// into the generic per-length buckets (kScriptOtherUtf8OneByte ..
// kScriptOtherUtf8FourBytes).  Every group below is evaluated regardless of
// earlier outcomes.  Prints the test name and outcome; returns true iff all
// groups passed.
bool TestOtherScripts() {
  std::cout << "Running " << __FUNCTION__ << std::endl;

  // ASCII punctuation: 1-byte characters.
  const bool one_byte_ok = kScriptOtherUtf8OneByte == GetScript("^") &&
                           kScriptOtherUtf8OneByte == GetScript("$");

  // Unrecognized 2-byte scripts, in order: Armenian, Syriac and Thaana.  For
  // info on these scripts, see http://www.unicode.org/charts/#scripts
  //
  // Note: each of these scripts is uniquely associated with a language.
  // Still, the number of queries in those languages is small and we didn't
  // want to increase the code size and latency, so (at least for now) they
  // are not treated specially.
  const bool two_bytes_ok = kScriptOtherUtf8TwoBytes == GetScript("Ձ") &&
                            kScriptOtherUtf8TwoBytes == GetScript("ܔ") &&
                            kScriptOtherUtf8TwoBytes == GetScript("ށ");

  // Unrecognized 3-byte script: CJK Unified Ideographs, which are not
  // uniquely associated with a language.
  const bool three_bytes_ok = kScriptOtherUtf8ThreeBytes == GetScript("万") &&
                              kScriptOtherUtf8ThreeBytes == GetScript("両");

  // Unrecognized 4-byte script: CJK Unified Ideographs Extension C.  The
  // literal is spelled as raw UTF-8 bytes; there is a nice UTF-8 encoder /
  // decoder at https://mothereff.in/utf-8
  const bool extension_c_ok =
      kScriptOtherUtf8FourBytes == GetScript("\xF0\xAA\x9C\x94");

  // Unrecognized 4-byte script: CJK Unified Ideographs Extension E.
  const bool extension_e_ok =
      kScriptOtherUtf8FourBytes == GetScript("\xF0\xAB\xA0\xB5") &&
      kScriptOtherUtf8FourBytes == GetScript("\xF0\xAC\xBA\xA1");

  return PrintAndReturnStatus(one_byte_ok && two_bytes_ok && three_bytes_ok &&
                              extension_c_ok && extension_e_ok);
}
} // namespace script_detector_test
} // namespace chrome_lang_id
// Runs the feature extraction tests.
int main(int argc, char **argv) {
const bool tests_successful =
chrome_lang_id::script_detector_test::TestGreekScript() &&
chrome_lang_id::script_detector_test::TestCyrillicScript() &&
chrome_lang_id::script_detector_test::TestHebrewScript() &&
chrome_lang_id::script_detector_test::TestArabicScript() &&
chrome_lang_id::script_detector_test::TestHangulJamoScript() &&
chrome_lang_id::script_detector_test::TestHiraganaScript() &&
chrome_lang_id::script_detector_test::TestKatakanaScript() &&
chrome_lang_id::script_detector_test::TestOtherScripts();
return tests_successful ? 0 : 1;
}