| // Copyright (C) 2025 The Libphonenumber Authors |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| #include <bits/stl_pair.h> |
| |
| #include <map> |
| #include <set> |
| #include <string> |
| #include <vector> |
| |
| #include "phonenumbers/regexp_cache.h" |
| #include "phonenumbers/regexp_factory.h" |
| #include "phonenumbers/stringutil.h" |
| |
| #ifndef I18N_PHONENUMBERS_REGEXPSANDMAPPINGS_H_ |
| #define I18N_PHONENUMBERS_REGEXPSANDMAPPINGS_H_ |
| |
| namespace i18n { |
| namespace phonenumbers { |
| |
| class PhoneNumberRegExpsAndMappings { |
| friend class PhoneContextParser; |
| friend class PhoneNumberNormalizer; |
| friend class PhoneNumberUtil; |
| friend class PhoneContextParserTest; |
| friend class PhoneNumberNormalizerTest; |
| |
| private: |
| void InitializeMapsAndSets(); |
| |
| // Helper initialiser method to create the regular-expression pattern to match |
| // extensions. Note that: |
| // - There are currently six capturing groups for the extension itself. If this |
| // number is changed, MaybeStripExtension needs to be updated. |
| // - The only capturing groups should be around the digits that you want to |
| // capture as part of the extension, or else parsing will fail! |
| static std::string CreateExtnPattern(bool for_parsing); |
| |
| // Helper method for constructing regular expressions for parsing. Creates an |
| // expression that captures up to max_length digits. |
| static std::string ExtnDigits(int max_length); |
| |
| // Regular expression of viable phone numbers. This is location independent. |
| // Checks we have at least three leading digits, and only valid punctuation, |
| // alpha characters and digits in the phone number. Does not include extension |
| // data. The symbol 'x' is allowed here as valid punctuation since it is often |
| // used as a placeholder for carrier codes, for example in Brazilian phone |
| // numbers. We also allow multiple plus-signs at the start. |
| // Corresponds to the following: |
| // [digits]{minLengthNsn}| |
| // plus_sign*(([punctuation]|[star])*[digits]){3,} |
| // ([punctuation]|[star]|[digits]|[alpha])* |
| // |
| // The first reg-ex is to allow short numbers (two digits long) to be parsed |
| // if they are entered as "15" etc, but only if there is no punctuation in |
| // them. The second expression restricts the number of digits to three or |
| // more, but then allows them to be in international form, and to have |
| // alpha-characters and punctuation. |
| const string valid_phone_number_; |
| |
| // Regexp of all possible ways to write extensions, for use when parsing. This |
| // will be run as a case-insensitive regexp match. Wide character versions are |
| // also provided after each ASCII version. |
| // For parsing, we are slightly more lenient in our interpretation than for |
| // matching. Here we allow "comma" and "semicolon" as possible extension |
| // indicators. When matching, these are hardly ever used to indicate this. |
| const string extn_patterns_for_parsing_; |
| |
| // Regular expressions of different parts of the phone-context parameter, |
| // following the syntax defined in RFC3966. |
| const std::string rfc3966_phone_digit_; |
| const std::string alphanum_; |
| const std::string rfc3966_domainlabel_; |
| const std::string rfc3966_toplabel_; |
| |
| scoped_ptr<const AbstractRegExpFactory> regexp_factory_; |
| scoped_ptr<RegExpCache> regexp_cache_; |
| |
| // A map that contains characters that are essential when dialling. That means |
| // any of the characters in this map must not be removed from a number when |
| // dialing, otherwise the call will not reach the intended destination. |
| std::map<char32, char> diallable_char_mappings_; |
| // These mappings map a character (key) to a specific digit that should |
| // replace it for normalization purposes. |
| std::map<char32, char> alpha_mappings_; |
| // For performance reasons, store a map of combining alpha_mappings with ASCII |
| // digits. |
| std::map<char32, char> alpha_phone_mappings_; |
| |
| // Separate map of all symbols that we wish to retain when formatting alpha |
| // numbers. This includes digits, ascii letters and number grouping symbols |
| // such as "-" and " ". |
| std::map<char32, char> all_plus_number_grouping_symbols_; |
| |
| // Map of country calling codes that use a mobile token before the area code. |
| // One example of when this is relevant is when determining the length of the |
| // national destination code, which should be the length of the area code plus |
| // the length of the mobile token. |
| std::map<int, char> mobile_token_mappings_; |
| |
| // Set of country codes that doesn't have national prefix, but it has area |
| // codes. |
| std::set<int> countries_without_national_prefix_with_area_codes_; |
| |
| // Set of country codes that have geographically assigned mobile numbers (see |
| // geo_mobile_countries_ below) which are not based on *area codes*. For |
| // example, in China mobile numbers start with a carrier indicator, and beyond |
| // that are geographically assigned: this carrier indicator is not considered |
| // to be an area code. |
| std::set<int> geo_mobile_countries_without_mobile_area_codes_; |
| |
| // Set of country calling codes that have geographically assigned mobile |
| // numbers. This may not be complete; we add calling codes case by case, as we |
| // find geographical mobile numbers or hear from user reports. |
| std::set<int> geo_mobile_countries_; |
| |
| // Pattern that makes it easy to distinguish whether a region has a single |
| // international dialing prefix or not. If a region has a single international |
| // prefix (e.g. 011 in USA), it will be represented as a string that contains |
| // a sequence of ASCII digits, and possibly a tilde, which signals waiting for |
| // the tone. If there are multiple available international prefixes in a |
| // region, they will be represented as a regex string that always contains one |
| // or more characters that are not ASCII digits or a tilde. |
| scoped_ptr<const RegExp> single_international_prefix_; |
| |
| scoped_ptr<const RegExp> digits_pattern_; |
| scoped_ptr<const RegExp> capturing_digit_pattern_; |
| scoped_ptr<const RegExp> capturing_ascii_digits_pattern_; |
| |
| // Regular expression of acceptable characters that may start a phone number |
| // for the purposes of parsing. This allows us to strip away meaningless |
| // prefixes to phone numbers that may be mistakenly given to us. This consists |
| // of digits, the plus symbol and arabic-indic digits. This does not contain |
| // alpha characters, although they may be used later in the number. It also |
| // does not include other punctuation, as this will be stripped later during |
| // parsing and is of no information value when parsing a number. The string |
| // starting with this valid character is captured. |
| // This corresponds to VALID_START_CHAR in the java version. |
| scoped_ptr<const RegExp> valid_start_char_pattern_; |
| |
| // Regular expression of valid characters before a marker that might indicate |
| // a second number. |
| scoped_ptr<const RegExp> capture_up_to_second_number_start_pattern_; |
| |
| // Regular expression of trailing characters that we want to remove. We remove |
| // all characters that are not alpha or numerical characters. The hash |
| // character is retained here, as it may signify the previous block was an |
| // extension. Note the capturing block at the start to capture the rest of the |
| // number if this was a match. |
| // This corresponds to UNWANTED_END_CHAR_PATTERN in the java version. |
| scoped_ptr<const RegExp> unwanted_end_char_pattern_; |
| |
| // Regular expression of groups of valid punctuation characters. |
| scoped_ptr<const RegExp> separator_pattern_; |
| |
| // Regexp of all possible ways to write extensions, for use when finding phone |
| // numbers in text. This will be run as a case-insensitive regexp match. Wide |
| // character versions are also provided after each ASCII version. |
| const string extn_patterns_for_matching_; |
| |
| // Regexp of all known extension prefixes used by different regions followed |
| // by 1 or more valid digits, for use when parsing. |
| scoped_ptr<const RegExp> extn_pattern_; |
| |
| // We append optionally the extension pattern to the end here, as a valid |
| // phone number may have an extension prefix appended, followed by 1 or more |
| // digits. |
| scoped_ptr<const RegExp> valid_phone_number_pattern_; |
| |
| // We use this pattern to check if the phone number has at least three letters |
| // in it - if so, then we treat it as a number where some phone-number digits |
| // are represented by letters. |
| scoped_ptr<const RegExp> valid_alpha_phone_pattern_; |
| |
| scoped_ptr<const RegExp> first_group_capturing_pattern_; |
| |
| scoped_ptr<const RegExp> carrier_code_pattern_; |
| |
| scoped_ptr<const RegExp> plus_chars_pattern_; |
| |
| // Regular expression of valid global-number-digits for the phone-context |
| // parameter, following the syntax defined in RFC3966. |
| std::unique_ptr<const RegExp> rfc3966_global_number_digits_pattern_; |
| |
| // Regular expression of valid domainname for the phone-context parameter, |
| // following the syntax defined in RFC3966. |
| std::unique_ptr<const RegExp> rfc3966_domainname_pattern_; |
| |
| PhoneNumberRegExpsAndMappings(); |
| |
| // This type is neither copyable nor movable. |
| PhoneNumberRegExpsAndMappings(const PhoneNumberRegExpsAndMappings&) = delete; |
| PhoneNumberRegExpsAndMappings& operator=( |
| const PhoneNumberRegExpsAndMappings&) = delete; |
| }; |
| |
| } // namespace phonenumbers |
| } // namespace i18n |
| |
| #endif // I18N_PHONENUMBERS_REGEXPSANDMAPPINGS_H_ |