/usr/lib/python3/dist-packages/phonenumbers/phonenumbermatcher.py

"""Functionality to match phone numbers in a piece of text"""

# Based on original Java code:
#     java/src/com/google/i18n/phonenumbers/PhoneNumberMatch.java
#     java/src/com/google/i18n/phonenumbers/PhoneNumberMatcher.java
#   Copyright (C) 2011 The Libphonenumber Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re

# Extra regexp function; see README
from .re_util import fullmatch
from .util import UnicodeMixin, u, unicod, prnt
from .util import U_EMPTY_STRING, U_DASH, U_SEMICOLON, U_SLASH, U_X_LOWER, U_X_UPPER, U_PERCENT, U_STAR
from .unicode_util import Category, Block, is_letter
from .phonenumberutil import _MAX_LENGTH_FOR_NSN, _MAX_LENGTH_COUNTRY_CODE
from .phonenumberutil import _VALID_PUNCTUATION, _PLUS_CHARS, NON_DIGITS_PATTERN
from .phonenumberutil import _EXTN_PATTERNS_FOR_MATCHING, _REGEX_FLAGS
from .phonenumberutil import _SECOND_NUMBER_START_PATTERN, _UNWANTED_END_CHAR_PATTERN
from .phonenumberutil import MatchType, NumberParseException, PhoneNumberFormat
from .phonenumberutil import is_possible_number, is_valid_number, parse
from .phonenumberutil import normalize_digits_only, national_significant_number
from .phonenumberutil import _format_nsn_using_pattern, ndd_prefix_for_region
from .phonenumberutil import format_number, is_number_match, region_code_for_country_code
from .phonenumberutil import _maybe_strip_national_prefix_carrier_code
from .phonenumberutil import _choose_formatting_pattern_for_number
from .phonenumberutil import _formatting_rule_has_first_group_only
from .phonenumber import CountryCodeSource
from .phonemetadata import PhoneMetadata

# Import auto-generated data structures
try:
    from .data import _ALT_NUMBER_FORMATS
except ImportError:  # pragma no cover
    # Before the generated code exists, the data/ directory is empty.
    # The generation process imports this module, creating a circular
    # dependency.  The hack below works around this.
    import os
    import sys
    if os.path.basename(sys.argv[0]) in ("buildmetadatafromxml.py", "buildprefixdata.py"):
        prnt("Failed to import generated data (but OK as during autogeneration)", file=sys.stderr)
        _ALT_NUMBER_FORMATS = {}
    else:
        raise


def _limit(lower, upper):
    """Returns a regular expression quantifier with an upper and lower limit."""
    if ((lower < 0) or (upper <= 0) or (upper < lower)):
        raise Exception("Illegal argument to _limit")
    return unicod("{%d,%d}") % (lower, upper)

# Build the MATCHING_BRACKETS and PATTERN regular expression patterns. The
# building blocks below exist to make the patterns more easily understood.

_OPENING_PARENS = u("(\\[\uFF08\uFF3B")
_CLOSING_PARENS = u(")\\]\uFF09\uFF3D")
_NON_PARENS = u("[^") + _OPENING_PARENS + _CLOSING_PARENS + u("]")
# Limit on the number of pairs of brackets in a phone number.
_BRACKET_PAIR_LIMIT = _limit(0, 3)

# Pattern to check that brackets match. Opening brackets should be closed
# within a phone number.  This also checks that there is something inside the
# brackets. Having no brackets at all is also fine.
#
# An opening bracket at the beginning may not be closed, but subsequent ones
# should be.  It's also possible that the leading bracket was dropped, so we
# shouldn't be surprised if we see a closing bracket first. We limit the sets
# of brackets in a phone number to four.
_MATCHING_BRACKETS = re.compile(u("(?:[") + _OPENING_PARENS + u("])?") + u("(?:") + _NON_PARENS + u("+") +
                                u("[") + _CLOSING_PARENS + u("])?") +
                                _NON_PARENS + u("+") +
                                u("(?:[") + _OPENING_PARENS + u("]") + _NON_PARENS +
                                u("+[") + _CLOSING_PARENS + u("])") + _BRACKET_PAIR_LIMIT +
                                _NON_PARENS + u("*"))

# Limit on the number of leading (plus) characters.
_LEAD_LIMIT = _limit(0, 2)
# Limit on the number of consecutive punctuation characters.
_PUNCTUATION_LIMIT = _limit(0, 4)
# The maximum number of digits allowed in a digit-separated block. As we allow
# all digits in a single block, set high enough to accommodate the entire
# national number and the international country code.
_DIGIT_BLOCK_LIMIT = (_MAX_LENGTH_FOR_NSN + _MAX_LENGTH_COUNTRY_CODE)
# Limit on the number of blocks separated by punctuation. Use _DIGIT_BLOCK_LIMIT
# since some formats use spaces to separate each digit.
_BLOCK_LIMIT = _limit(0, _DIGIT_BLOCK_LIMIT)

# A punctuation sequence allowing white space.
_PUNCTUATION = u("[") + _VALID_PUNCTUATION + u("]") + _PUNCTUATION_LIMIT
# A digits block without punctuation.
_DIGIT_SEQUENCE = u("\\d") + _limit(1, _DIGIT_BLOCK_LIMIT)
# Punctuation that may be at the start of a phone number - brackets and plus signs.
_LEAD_CLASS_CHARS = _OPENING_PARENS + _PLUS_CHARS
_LEAD_CLASS = u("[") + _LEAD_CLASS_CHARS + u("]")
_LEAD_PATTERN = re.compile(_LEAD_CLASS)

# Phone number pattern allowing optional punctuation.
# This is the phone number pattern used by _find(), similar to
# phonenumberutil._VALID_PHONE_NUMBER, but with the following differences:
# - All captures are limited in order to place an upper bound to the text
#   matched by the pattern.
# - Leading punctuation / plus signs are limited.
# - Consecutive occurrences of punctuation are limited.
# - Number of digits is limited.
# - No whitespace is allowed at the start or end.
# - No alpha digits (vanity numbers such as 1-800-SIX-FLAGS) are currently
#   supported.
_PATTERN = re.compile(u("(?:") + _LEAD_CLASS + _PUNCTUATION + u(")") + _LEAD_LIMIT +
                      _DIGIT_SEQUENCE + u("(?:") + _PUNCTUATION + _DIGIT_SEQUENCE + u(")") + _BLOCK_LIMIT +
                      u("(?:") + _EXTN_PATTERNS_FOR_MATCHING + u(")?"),
                      _REGEX_FLAGS)

# Matches strings that look like publication pages. Example: "Computing
# Complete Answers to Queries in the Presence of Limited Access Patterns.
# Chen Li. VLDB J. 12(3): 211-227 (2003)."
#
# The string "211-227 (2003)" is not a telephone number.
_PUB_PAGES = re.compile(u("\\d{1,5}-+\\d{1,5}\\s{0,4}\\(\\d{1,4}"))

# Matches strings that look like dates using "/" as a separator. Examples:
# 3/10/2011, 31/10/96 or 08/31/95.
_SLASH_SEPARATED_DATES = re.compile(u("(?:(?:[0-3]?\\d/[01]?\\d)|(?:[01]?\\d/[0-3]?\\d))/(?:[12]\\d)?\\d{2}"))

# Matches timestamps. Examples: "2012-01-02 08:00". Note that the reg-ex does
# not include the trailing ":\d\d" -- that is covered by TIME_STAMPS_SUFFIX.
_TIME_STAMPS = re.compile(u("[12]\\d{3}[-/]?[01]\\d[-/]?[0-3]\\d +[0-2]\\d$"))
_TIME_STAMPS_SUFFIX = re.compile(u(":[0-5]\\d"))

# Patterns used to extract phone numbers from a larger phone-number-like
# pattern. These are ordered according to specificity. For example,
# white-space is last since that is frequently used in numbers, not just to
# separate two numbers. We have separate patterns since we don't want to break
# up the phone-number-like text on more than one different kind of symbol at
# one time, although symbols of the same type (e.g. space) can be safely
# grouped together.
#
# Note that if there is a match, we will always check any text found up to the
# first match as well.
_INNER_MATCHES = (
    # Breaks on the slash - e.g. "651-234-2345/332-445-1234"
    re.compile(u("/+(.*)")),
    # Note that the bracket here is inside the capturing group, since we
    # consider it part of the phone number. Will match a pattern like "(650)
    # 223 3345 (754) 223 3321".
    re.compile(u("(\\([^(]*)")),
    # Breaks on a hyphen - e.g. "12345 - 332-445-1234 is my number."  We
    # require a space on either side of the hyphen for it to be considered a
    # separator.
    re.compile(u("(?u)(?:\\s-|-\\s)\\s*(.+)")),
    # Various types of wide hyphens. Note we have decided not to enforce a
    # space here, since it's possible that it's supposed to be used to break
    # two numbers without spaces, and we haven't seen many instances of it
    # used within a number.
    re.compile(u("(?u)[\u2012-\u2015\uFF0D]\\s*(.+)")),
    # Breaks on a full stop - e.g. "12345. 332-445-1234 is my number."
    re.compile(u("(?u)\\.+\\s*([^.]+)")),
    # Breaks on space - e.g. "3324451234 8002341234"
    re.compile(u("(?u)\\s+(\\S+)")))


class Leniency(object):
    """Leniency when finding potential phone numbers in text segments.

    The levels here are ordered in increasing strictness."""
    # Phone numbers accepted are possible (i.e. is_possible_number(number)) but
    # not necessarily valid (is_valid_number(number)).
    POSSIBLE = 0
    # Phone numbers accepted are both possible (is_possible_number(number))
    # and valid (is_valid_number(PhoneNumber)). Numbers written in national
    # format must have their national-prefix present if it is usually written
    # for a number of this type.
    VALID = 1
    # Phone numbers accepted are valid (i.e. is_valid_number(number)) and are
    # grouped in a possible way for this locale. For example, a US number
    # written as "65 02 53 00 00" and "650253 0000" are not accepted at this
    # leniency level, whereas "650 253 0000", "650 2530000" or "6502530000"
    # are.
    # Numbers with more than one '/' symbol in the national significant number
    # are also dropped at this level.
    #
    # Warning: This level might result in lower coverage especially for
    # regions outside of country code "+1". If you are not sure about which
    # level to use, email the discussion group
    # libphonenumber-discuss@googlegroups.com.
    STRICT_GROUPING = 2
    # Phone numbers accepted are valid (i.e. is_valid_number(number)) and are
    # grouped in the same way that we would have formatted it, or as a single
    # block. For example, a US number written as "650 2530000" is not accepted
    # at this leniency level, whereas "650 253 0000" or "6502530000" are.
    # Numbers with more than one '/' symbol are also dropped at this level.
    # Warning: This level might result in lower coverage especially for
    # regions outside of country code "+1". If you are not sure about which
    # level to use, email the discussion group
    # libphonenumber-discuss@googlegroups.com.
    EXACT_GROUPING = 3


def _verify(leniency, numobj, candidate):
    """Returns True if number is a verified number according to the
    leniency."""
    if leniency == Leniency.POSSIBLE:
        return is_possible_number(numobj)
    elif leniency == Leniency.VALID:
        if (not is_valid_number(numobj) or
            not _contains_only_valid_x_chars(numobj, candidate)):
            return False
        return _is_national_prefix_present_if_required(numobj)
    elif leniency == Leniency.STRICT_GROUPING:
        return _verify_strict_grouping(numobj, candidate)
    elif leniency == Leniency.EXACT_GROUPING:
        return _verify_exact_grouping(numobj, candidate)
    else:
        raise Exception("Error: unsupported Leniency value %s" % leniency)


def _verify_strict_grouping(numobj, candidate):
    if (not is_valid_number(numobj) or
        not _contains_only_valid_x_chars(numobj, candidate) or
        _contains_more_than_one_slash_in_national_number(numobj, candidate) or
        not _is_national_prefix_present_if_required(numobj)):
        return False
    return _check_number_grouping_is_valid(numobj, candidate,
                                           _all_number_groups_remain_grouped)


def _all_number_groups_remain_grouped(numobj, normalized_candidate, formatted_number_groups):
    """Returns True if the groups of digits found in our candidate phone number match our
    expectations.

    Arguments:
    numobj -- the original number we found when parsing
    normalized_candidate -- the candidate number, normalized to only contain ASCII digits,
          but with non-digits (spaces etc) retained
    expected_number_groups -- the groups of digits that we would expect to see if we
          formatted this number
    Returns True if expectations matched.
    """
    from_index = 0
    if numobj.country_code_source != CountryCodeSource.FROM_DEFAULT_COUNTRY:
        # First skip the country code if the normalized candidate contained it.
        country_code = str(numobj.country_code)
        from_index = normalized_candidate.find(country_code) + len(country_code)
    # Check each group of consecutive digits are not broken into separate
    # groupings in the candidate string.
    for ii, formatted_number_group in enumerate(formatted_number_groups):
        # Fails if the substring of normalized_candidate starting from
        # from_index doesn't contain the consecutive digits in
        # formatted_number_group.
        from_index = normalized_candidate.find(formatted_number_group, from_index)
        if from_index < 0:
            return False
        # Moves from_index forward.
        from_index += len(formatted_number_group)
        if (ii == 0 and from_index < len(normalized_candidate)):
            # We are at the position right after the NDC.  We get the region
            # used for formatting information based on the country code in the
            # phone number, rather than the number itself, as we do not need
            # to distinguish between different countries with the same country
            # calling code and this is faster.
            region = region_code_for_country_code(numobj.country_code)
            if (ndd_prefix_for_region(region, True) is not None and
                normalized_candidate[from_index].isdigit()):
                # This means there is no formatting symbol after the NDC. In
                # this case, we only accept the number if there is no
                # formatting symbol at all in the number, except for
                # extensions.  This is only important for countries with
                # national prefixes.
                nsn = national_significant_number(numobj)
                return normalized_candidate[(from_index - len(formatted_number_group)):].startswith(nsn)
    # The check here makes sure that we haven't mistakenly already used the extension to
    # match the last group of the subscriber number. Note the extension cannot have
    # formatting in-between digits.
    return (normalized_candidate[from_index:].find(numobj.extension or U_EMPTY_STRING) != -1)


def _verify_exact_grouping(numobj, candidate):
    if (not is_valid_number(numobj) or
        not _contains_only_valid_x_chars(numobj, candidate) or
        _contains_more_than_one_slash_in_national_number(numobj, candidate) or
        not _is_national_prefix_present_if_required(numobj)):
        return False
    return _check_number_grouping_is_valid(numobj, candidate,
                                           _all_number_groups_are_exactly_present)


def _all_number_groups_are_exactly_present(numobj, normalized_candidate, formatted_number_groups):
    """Returns True if the groups of digits found in our candidate phone number match our
    expectations.

    Arguments:
    numobj -- the original number we found when parsing
    normalized_candidate -- the candidate number, normalized to only contain ASCII digits,
          but with non-digits (spaces etc) retained
    expected_number_groups -- the groups of digits that we would expect to see if we
          formatted this number
    Returns True if expectations matched.
    """
    candidate_groups = re.split(NON_DIGITS_PATTERN, normalized_candidate)
    # Set this to the last group, skipping it if the number has an extension.
    if numobj.extension is not None:
        candidate_number_group_index = len(candidate_groups) - 2
    else:
        candidate_number_group_index = len(candidate_groups) - 1
    # First we check if the national significant number is formatted as a
    # block.  We use contains and not equals, since the national significant
    # number may be present with a prefix such as a national number prefix, or
    # the country code itself.
    if (len(candidate_groups) == 1 or
        candidate_groups[candidate_number_group_index].find(national_significant_number(numobj)) != -1):
        return True
    # Starting from the end, go through in reverse, excluding the first group,
    # and check the candidate and number groups are the same.
    formatted_number_group_index = len(formatted_number_groups) - 1
    while (formatted_number_group_index > 0 and candidate_number_group_index >= 0):
        if (candidate_groups[candidate_number_group_index] !=
            formatted_number_groups[formatted_number_group_index]):
            return False
        formatted_number_group_index -= 1
        candidate_number_group_index -= 1
    # Now check the first group. There may be a national prefix at the start, so we only check
    # that the candidate group ends with the formatted number group.
    return (candidate_number_group_index >= 0 and
            candidate_groups[candidate_number_group_index].endswith(formatted_number_groups[0]))


def _get_national_number_groups(numobj, formatting_pattern=None):
    """Helper method to get the national-number part of a number, formatted without any national
    prefix, and return it as a set of digit blocks that would be formatted together."""
    if formatting_pattern is None:
        # This will be in the format +CC-DG;ext=EXT where DG represents groups of digits.
        rfc3966_format = format_number(numobj, PhoneNumberFormat.RFC3966)
        # We remove the extension part from the formatted string before splitting
        # it into different groups.
        end_index = rfc3966_format.find(U_SEMICOLON)
        if end_index < 0:
            end_index = len(rfc3966_format)

        # The country-code will have a '-' following it.
        start_index = rfc3966_format.find(U_DASH) + 1
        return rfc3966_format[start_index:end_index].split(U_DASH)
    else:
        # We format the NSN only, and split that according to the separator.
        nsn = national_significant_number(numobj)
        return _format_nsn_using_pattern(nsn, formatting_pattern,
                                         PhoneNumberFormat.RFC3966).split(U_DASH)


def _check_number_grouping_is_valid(numobj, candidate, checker):
    # TODO: Evaluate how this works for other locales (testing has been
    # limited to NANPA regions) and optimise if necessary.
    normalized_candidate = normalize_digits_only(candidate, True)  # keep non-digits
    formatted_number_groups = _get_national_number_groups(numobj, None)
    if checker(numobj, normalized_candidate, formatted_number_groups):
        return True
    # If this didn't pass, see if there are any alternate formats, and try them instead.
    alternate_formats = _ALT_NUMBER_FORMATS.get(numobj.country_code, None)
    if alternate_formats is not None:
        for alternate_format in alternate_formats:
            formatted_number_groups = _get_national_number_groups(numobj, alternate_format)
            if checker(numobj, normalized_candidate, formatted_number_groups):
                return True
    return False


def _contains_more_than_one_slash_in_national_number(numobj, candidate):
    first_slash_in_body_index = candidate.find(U_SLASH)
    if first_slash_in_body_index < 0:
        # No slashes, this is okay.
        return False
    # Now look for a second one.
    second_slash_in_body_index = candidate.find(U_SLASH, first_slash_in_body_index + 1)
    if second_slash_in_body_index < 0:
        # Only one slash, this is okay.,
        return False

    # If the first slash is after the country calling code, this is permitted.
    candidate_has_country_code = (numobj.country_code_source == CountryCodeSource.FROM_NUMBER_WITH_PLUS_SIGN or
                                  numobj.country_code_source == CountryCodeSource.FROM_NUMBER_WITHOUT_PLUS_SIGN)
    if (candidate_has_country_code and
        normalize_digits_only(candidate[:first_slash_in_body_index]) ==
        unicod(numobj.country_code)):
        # Any more slashes and this is illegal.
        return (candidate[(second_slash_in_body_index + 1):].find(U_SLASH) != -1)
    return True


def _contains_only_valid_x_chars(numobj, candidate):
    # The characters 'x' and 'X' can be (1) a carrier code, in which case they
    # always precede the national significant number or (2) an extension sign,
    # in which case they always precede the extension number. We assume a
    # carrier code is more than 1 digit, so the first case has to have more
    # than 1 consecutive 'x' or 'X', whereas the second case can only have
    # exactly 1 'x' or 'X'. We ignore the character if it appears as the last
    # character of the string.
    ii = 0
    while ii < (len(candidate) - 1):
        if (candidate[ii] == U_X_LOWER or candidate[ii] == U_X_UPPER):
            next_char = candidate[ii + 1]
            if (next_char == U_X_LOWER or next_char == U_X_UPPER):
                # This is the carrier code case, in which the 'X's always
                # precede the national significant number.
                ii += 1
                if is_number_match(numobj, candidate[ii:]) != MatchType.NSN_MATCH:
                    return False
            # This is the extension sign case, in which the 'x' or 'X' should
            # always precede the extension number.
            elif normalize_digits_only(candidate[ii:]) != numobj.extension:
                return False
        ii += 1
    return True


def _is_national_prefix_present_if_required(numobj):
    # First, check how we deduced the country code. If it was written in
    # international format, then the national prefix is not required.
    if numobj.country_code_source != CountryCodeSource.FROM_DEFAULT_COUNTRY:
        return True
    phone_number_region = region_code_for_country_code(numobj.country_code)
    metadata = PhoneMetadata.metadata_for_region(phone_number_region, None)
    if metadata is None:
        return True
    # Check if a national prefix should be present when formatting this number.
    national_number = national_significant_number(numobj)
    format_rule = _choose_formatting_pattern_for_number(metadata.number_format,
                                                        national_number)
    # To do this, we check that a national prefix formatting rule was present
    # and that it wasn't just the first-group symbol ($1) with punctuation.
    if (format_rule is not None and
        format_rule.national_prefix_formatting_rule):
        if format_rule.national_prefix_optional_when_formatting:
            # The national-prefix is optional in these cases, so we don't need
            # to check if it was present.
            return True
        if _formatting_rule_has_first_group_only(format_rule.national_prefix_formatting_rule):
            # National Prefix not needed for this number.
            return True
        # Normalize the remainder.
        raw_input = normalize_digits_only(numobj.raw_input)
        # Check if we found a national prefix and/or carrier code at the start of the raw input,
        # and return the result.
        return _maybe_strip_national_prefix_carrier_code(raw_input, metadata)[2]
    return True


class PhoneNumberMatcher(object):
    """A stateful class that finds and extracts telephone numbers from text.

    Vanity numbers (phone numbers using alphabetic digits such as '1-800-SIX-FLAGS' are
    not found.

    This class is not thread-safe.
    """
    # The potential states of a PhoneNumberMatcher.
    _NOT_READY = 0
    _READY = 1
    _DONE = 2

    def __init__(self, text, region,
                 leniency=Leniency.VALID, max_tries=65535):
        """Creates a new instance.

        Arguments:
        text -- The character sequence that we will search, None for no text.
        country -- The country to assume for phone numbers not written in
              international format (with a leading plus, or with the
              international dialing prefix of the specified region). May be
              None or "ZZ" if only numbers with a leading plus should be
              considered.
        leniency -- The leniency to use when evaluating candidate phone
              numbers.
        max_tries -- The maximum number of invalid numbers to try before
              giving up on the text.  This is to cover degenerate cases where
              the text has a lot of false positives in it. Must be >= 0.
        """
        if leniency is None:
            raise ValueError("Need a leniency value")
        if int(max_tries) < 0:
            raise ValueError("Need max_tries to be positive int")
        # The text searched for phone numbers.
        self.text = text
        if self.text is None:
            self.text = U_EMPTY_STRING
        # The region (country) to assume for phone numbers without an
        # international prefix, possibly None.
        self.preferred_region = region
        # The degree of validation requested.
        self.leniency = leniency
        # The maximum number of retries after matching an invalid number.
        self._max_tries = int(max_tries)
        # The iteration tristate.
        self._state = PhoneNumberMatcher._NOT_READY
        # The last successful match, None unless in state _READY
        self._last_match = None
        # The next index to start searching at. Undefined in state _DONE
        self._search_index = 0

    def _find(self, index):
        """Attempts to find the next subsequence in the searched sequence on or after index
        that represents a phone number. Returns the next match, None if none was found.

        Arguments:
        index -- The search index to start searching at.
        Returns the phone number match found, None if none can be found.
        """
        match = _PATTERN.search(self.text, index)
        while self._max_tries > 0 and match is not None:
            start = match.start()
            candidate = self.text[start:match.end()]

            # Check for extra numbers at the end.
            # TODO: This is the place to start when trying to support
            # extraction of multiple phone number from split notations (+41 79
            # 123 45 67 / 68).
            candidate = self._trim_after_first_match(_SECOND_NUMBER_START_PATTERN,
                                                     candidate)

            match = self._extract_match(candidate, start)
            if match is not None:
                return match
            # Move along
            index = start + len(candidate)
            self._max_tries -= 1
            match = _PATTERN.search(self.text, index)
        return None

    def _trim_after_first_match(self, pattern, candidate):
        """Trims away any characters after the first match of pattern in
        candidate, returning the trimmed version."""
        trailing_chars_match = pattern.search(candidate)
        if trailing_chars_match:
            candidate = candidate[:trailing_chars_match.start()]
        return candidate

    @classmethod
    def _is_latin_letter(cls, letter):
        """Helper method to determine if a character is a Latin-script letter
        or not. For our purposes, combining marks should also return True
        since we assume they have been added to a preceding Latin character."""
        # Combining marks are a subset of non-spacing-mark
        if (not is_letter(letter) and
            Category.get(letter) != Category.NON_SPACING_MARK):
            return False
        block = Block.get(letter)
        return (block == Block.BASIC_LATIN or
                block == Block.LATIN_1_SUPPLEMENT or
                block == Block.LATIN_EXTENDED_A or
                block == Block.LATIN_EXTENDED_ADDITIONAL or
                block == Block.LATIN_EXTENDED_B or
                block == Block.COMBINING_DIACRITICAL_MARKS)

    @classmethod
    def _is_invalid_punctuation_symbol(cls, character):
        return (character == U_PERCENT or
                Category.get(character) == Category.CURRENCY_SYMBOL)

    def _extract_match(self, candidate, offset):
        """Attempts to extract a match from a candidate string.

        Arguments:
        candidate -- The candidate text that might contain a phone number.
        offset -- The offset of candidate within self.text
        Returns the match found, None if none can be found
        """
        # Skip a match that is more likely a publication page reference or a
        # date.
        if (_SLASH_SEPARATED_DATES.search(candidate)):
            return None

        # Skip potential time-stamps.
        if _TIME_STAMPS.search(candidate):
            following_text = self.text[offset + len(candidate):]
            if _TIME_STAMPS_SUFFIX.match(following_text):
                return None

        # Try to come up with a valid match given the entire candidate.
        match = self._parse_and_verify(candidate, offset)
        if match is not None:
            return match

        # If that failed, try to find an "inner match" -- there might be a
        # phone number within this candidate.
        return self._extract_inner_match(candidate, offset)

    def _extract_inner_match(self, candidate, offset):
        """Attempts to extract a match from candidate if the whole candidate
        does not qualify as a match.

        Arguments:
        candidate -- The candidate text that might contain a phone number
        offset -- The current offset of candidate within text
        Returns the match found, None if none can be found
        """
        for possible_inner_match in _INNER_MATCHES:
            group_match = possible_inner_match.search(candidate)
            is_first_match = True
            while group_match and self._max_tries > 0:
                if is_first_match:
                    # We should handle any group before this one too.
                    group = self._trim_after_first_match(_UNWANTED_END_CHAR_PATTERN,
                                                         candidate[:group_match.start()])
                    match = self._parse_and_verify(group, offset)
                    if match is not None:
                        return match
                    self._max_tries -= 1
                    is_first_match = False
                group = self._trim_after_first_match(_UNWANTED_END_CHAR_PATTERN,
                                                     group_match.group(1))
                match = self._parse_and_verify(group, offset + group_match.start(1))
                if match is not None:
                    return match
                self._max_tries -= 1
                group_match = possible_inner_match.search(candidate, group_match.start() + 1)
        return None

    def _parse_and_verify(self, candidate, offset):
        """Parses a phone number from the candidate using phonenumberutil.parse and
        verifies it matches the requested leniency. If parsing and verification succeed, a
        corresponding PhoneNumberMatch is returned, otherwise this method returns None.

        Arguments:
        candidate -- The candidate match.
        offset -- The offset of candidate within self.text.
        Returns the parsed and validated phone number match, or None.
        """
        try:
            # Check the candidate doesn't contain any formatting which would
            # indicate that it really isn't a phone number.
            if (not fullmatch(_MATCHING_BRACKETS, candidate) or _PUB_PAGES.search(candidate)):
                return None

            # If leniency is set to VALID or stricter, we also want to skip
            # numbers that are surrounded by Latin alphabetic characters, to
            # skip cases like abc8005001234 or 8005001234def.
            if self.leniency >= Leniency.VALID:
                # If the candidate is not at the start of the text, and does
                # not start with phone-number punctuation, check the previous
                # character
                if (offset > 0 and
                    not _LEAD_PATTERN.match(candidate)):
                    previous_char = self.text[offset - 1]
                    # We return None if it is a latin letter or an invalid
                    # punctuation symbol
                    if (self._is_invalid_punctuation_symbol(previous_char) or
                        self._is_latin_letter(previous_char)):
                        return None
                last_char_index = offset + len(candidate)
                if last_char_index < len(self.text):
                    next_char = self.text[last_char_index]
                    if (self._is_invalid_punctuation_symbol(next_char) or
                        self._is_latin_letter(next_char)):
                        return None

            numobj = parse(candidate, self.preferred_region, keep_raw_input=True)
            # Check Israel * numbers: these are a special case in that they
            # are four-digit numbers that our library supports, but they can
            # only be dialled with a leading *. Since we don't actually store
            # or detect the * in our phone number library, this means in
            # practice we detect most four digit numbers as being valid for
            # Israel. We are considering moving these numbers to
            # ShortNumberInfo instead, in which case this problem would go
            # away, but in the meantime we want to restrict the false matches
            # so we only allow these numbers if they are preceded by a
            # star. We enforce this for all leniency levels even though these
            # numbers are technically accepted by isPossibleNumber and
            # isValidNumber since we consider it to be a deficiency in those
            # methods that they accept these numbers without the *.
            # TODO: Remove this or make it significantly less hacky once we've
            # decided how to handle these short codes going forward in
            # ShortNumberInfo. We could use the formatting rules for instance,
            # but that would be slower.
            if (region_code_for_country_code(numobj.country_code) == "IL" and
                len(national_significant_number(numobj)) == 4 and
                (offset == 0 or (offset > 0 and self.text[offset - 1] != U_STAR))):
                # No match.
                return None
            if _verify(self.leniency, numobj, candidate):
                # We used parse(keep_raw_input=True) to create this number,
                # but for now we don't return the extra values parsed.
                # TODO: stop clearing all values here and switch all users
                # over to using raw_input rather than the raw_string of
                # PhoneNumberMatch.
                numobj.country_code_source = CountryCodeSource.UNSPECIFIED
                numobj.raw_input = None
                numobj.preferred_domestic_carrier_code = None
                return PhoneNumberMatch(offset, candidate, numobj)
        except NumberParseException:
            # ignore and continue
            pass
        return None

    def has_next(self):
        """Indicates whether there is another match available"""
        if self._state == PhoneNumberMatcher._NOT_READY:
            self._last_match = self._find(self._search_index)
            if self._last_match is None:
                self._state = PhoneNumberMatcher._DONE
            else:
                self._search_index = self._last_match.end
                self._state = PhoneNumberMatcher._READY
        return (self._state == PhoneNumberMatcher._READY)

    def next(self):
        """Return the next match; raises Exception if no next match available"""
        # Check the state and find the next match as a side-effect if necessary.
        if not self.has_next():
            raise StopIteration("No next match")
        # Don't retain that memory any longer than necessary.
        result = self._last_match
        self._last_match = None
        self._state = PhoneNumberMatcher._NOT_READY
        return result

    def __iter__(self):
        while self.has_next():
            yield self.next()


class PhoneNumberMatch(UnicodeMixin):
    """The immutable match of a phone number within a piece of text.

    Matches may be found using the find() method of PhoneNumberMatcher.

    A match consists of the phone number (in .number) as well as the .start
    and .end offsets of the corresponding subsequence of the searched
    text. Use .raw_string to obtain a copy of the matched subsequence.

    The following annotated example clarifies the relationship between the
    searched text, the match offsets, and the parsed number:

    >>> text = "Call me at +1 425 882-8080 for details."
    >>> country = "US"
    >>> import phonenumbers
    >>> matcher = phonenumbers.PhoneNumberMatcher(text, country)
    >>> matcher.has_next()
    True
    >>> m = matcher.next()  # Find the first phone number match
    >>> m.raw_string # contains the phone number as it appears in the text.
    "+1 425 882-8080"
    >>> (m.start, m.end)  # define the range of the matched subsequence.
    (11, 26)
    >>> text[m.start, m.end]
    "+1 425 882-8080"
    >>> phonenumberutil.parse("+1 425 882-8080", "US") == m.number
    True
    """
    def __init__(self, start, raw_string, numobj):
        if start < 0:
            raise Exception("Start index not >= 0")
        if raw_string is None or numobj is None:
            raise Exception("Invalid argument")
        # The start index into the text.
        self.start = start
        # The raw substring matched.
        self.raw_string = raw_string
        self.end = self.start + len(raw_string)
        # The matched phone number.
        self.number = numobj

    def __eq__(self, other):
        if not isinstance(other, PhoneNumberMatch):
            return False
        return (self.start == other.start and
                self.raw_string == other.raw_string and
                self.end == other.end and
                self.number == other.number)

    def __ne__(self, other):
        return not self.__eq__(other)

    def __repr__(self):
        return (unicod("PhoneNumberMatch(start=%r, raw_string=%r, numobj=%r)") %
                (self.start,
                 self.raw_string,
                 self.number))

    def __unicode__(self):
        return unicod("PhoneNumberMatch [%s,%s) %s") % (self.start, self.end, self.raw_string)
python3-phonenumbers 8.8.1-2 / usr / lib / python3 / dist-packages / phonenumbers / phonenumbermatcher.py