/usr/lib/python3/dist-packages/phonenumbers/phonenumbermatcher.py is in python3-phonenumbers 8.8.1-2.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 | """Functionality to match phone numbers in a piece of text"""
# Based on original Java code:
# java/src/com/google/i18n/phonenumbers/PhoneNumberMatch.java
# java/src/com/google/i18n/phonenumbers/PhoneNumberMatcher.java
# Copyright (C) 2011 The Libphonenumber Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
# Extra regexp function; see README
from .re_util import fullmatch
from .util import UnicodeMixin, u, unicod, prnt
from .util import U_EMPTY_STRING, U_DASH, U_SEMICOLON, U_SLASH, U_X_LOWER, U_X_UPPER, U_PERCENT, U_STAR
from .unicode_util import Category, Block, is_letter
from .phonenumberutil import _MAX_LENGTH_FOR_NSN, _MAX_LENGTH_COUNTRY_CODE
from .phonenumberutil import _VALID_PUNCTUATION, _PLUS_CHARS, NON_DIGITS_PATTERN
from .phonenumberutil import _EXTN_PATTERNS_FOR_MATCHING, _REGEX_FLAGS
from .phonenumberutil import _SECOND_NUMBER_START_PATTERN, _UNWANTED_END_CHAR_PATTERN
from .phonenumberutil import MatchType, NumberParseException, PhoneNumberFormat
from .phonenumberutil import is_possible_number, is_valid_number, parse
from .phonenumberutil import normalize_digits_only, national_significant_number
from .phonenumberutil import _format_nsn_using_pattern, ndd_prefix_for_region
from .phonenumberutil import format_number, is_number_match, region_code_for_country_code
from .phonenumberutil import _maybe_strip_national_prefix_carrier_code
from .phonenumberutil import _choose_formatting_pattern_for_number
from .phonenumberutil import _formatting_rule_has_first_group_only
from .phonenumber import CountryCodeSource
from .phonemetadata import PhoneMetadata
# Import auto-generated data structures
try:
from .data import _ALT_NUMBER_FORMATS
except ImportError: # pragma no cover
# Before the generated code exists, the data/ directory is empty.
# The generation process imports this module, creating a circular
# dependency. The hack below works around this.
import os
import sys
if os.path.basename(sys.argv[0]) in ("buildmetadatafromxml.py", "buildprefixdata.py"):
prnt("Failed to import generated data (but OK as during autogeneration)", file=sys.stderr)
_ALT_NUMBER_FORMATS = {}
else:
raise
def _limit(lower, upper):
"""Returns a regular expression quantifier with an upper and lower limit."""
if ((lower < 0) or (upper <= 0) or (upper < lower)):
raise Exception("Illegal argument to _limit")
return unicod("{%d,%d}") % (lower, upper)
# Build the MATCHING_BRACKETS and PATTERN regular expression patterns. The
# building blocks below exist to make the patterns more easily understood.
_OPENING_PARENS = u("(\\[\uFF08\uFF3B")
_CLOSING_PARENS = u(")\\]\uFF09\uFF3D")
_NON_PARENS = u("[^") + _OPENING_PARENS + _CLOSING_PARENS + u("]")
# Limit on the number of pairs of brackets in a phone number.
_BRACKET_PAIR_LIMIT = _limit(0, 3)
# Pattern to check that brackets match. Opening brackets should be closed
# within a phone number. This also checks that there is something inside the
# brackets. Having no brackets at all is also fine.
#
# An opening bracket at the beginning may not be closed, but subsequent ones
# should be. It's also possible that the leading bracket was dropped, so we
# shouldn't be surprised if we see a closing bracket first. We limit the sets
# of brackets in a phone number to four.
_MATCHING_BRACKETS = re.compile(u("(?:[") + _OPENING_PARENS + u("])?") + u("(?:") + _NON_PARENS + u("+") +
u("[") + _CLOSING_PARENS + u("])?") +
_NON_PARENS + u("+") +
u("(?:[") + _OPENING_PARENS + u("]") + _NON_PARENS +
u("+[") + _CLOSING_PARENS + u("])") + _BRACKET_PAIR_LIMIT +
_NON_PARENS + u("*"))
# Limit on the number of leading (plus) characters.
_LEAD_LIMIT = _limit(0, 2)
# Limit on the number of consecutive punctuation characters.
_PUNCTUATION_LIMIT = _limit(0, 4)
# The maximum number of digits allowed in a digit-separated block. As we allow
# all digits in a single block, set high enough to accommodate the entire
# national number and the international country code.
_DIGIT_BLOCK_LIMIT = (_MAX_LENGTH_FOR_NSN + _MAX_LENGTH_COUNTRY_CODE)
# Limit on the number of blocks separated by punctuation. Use _DIGIT_BLOCK_LIMIT
# since some formats use spaces to separate each digit.
_BLOCK_LIMIT = _limit(0, _DIGIT_BLOCK_LIMIT)
# A punctuation sequence allowing white space.
_PUNCTUATION = u("[") + _VALID_PUNCTUATION + u("]") + _PUNCTUATION_LIMIT
# A digits block without punctuation.
_DIGIT_SEQUENCE = u("\\d") + _limit(1, _DIGIT_BLOCK_LIMIT)
# Punctuation that may be at the start of a phone number - brackets and plus signs.
_LEAD_CLASS_CHARS = _OPENING_PARENS + _PLUS_CHARS
_LEAD_CLASS = u("[") + _LEAD_CLASS_CHARS + u("]")
_LEAD_PATTERN = re.compile(_LEAD_CLASS)
# Phone number pattern allowing optional punctuation.
# This is the phone number pattern used by _find(), similar to
# phonenumberutil._VALID_PHONE_NUMBER, but with the following differences:
# - All captures are limited in order to place an upper bound to the text
# matched by the pattern.
# - Leading punctuation / plus signs are limited.
# - Consecutive occurrences of punctuation are limited.
# - Number of digits is limited.
# - No whitespace is allowed at the start or end.
# - No alpha digits (vanity numbers such as 1-800-SIX-FLAGS) are currently
# supported.
_PATTERN = re.compile(u("(?:") + _LEAD_CLASS + _PUNCTUATION + u(")") + _LEAD_LIMIT +
_DIGIT_SEQUENCE + u("(?:") + _PUNCTUATION + _DIGIT_SEQUENCE + u(")") + _BLOCK_LIMIT +
u("(?:") + _EXTN_PATTERNS_FOR_MATCHING + u(")?"),
_REGEX_FLAGS)
# Matches strings that look like publication pages. Example: "Computing
# Complete Answers to Queries in the Presence of Limited Access Patterns.
# Chen Li. VLDB J. 12(3): 211-227 (2003)."
#
# The string "211-227 (2003)" is not a telephone number.
_PUB_PAGES = re.compile(u("\\d{1,5}-+\\d{1,5}\\s{0,4}\\(\\d{1,4}"))
# Matches strings that look like dates using "/" as a separator. Examples:
# 3/10/2011, 31/10/96 or 08/31/95.
_SLASH_SEPARATED_DATES = re.compile(u("(?:(?:[0-3]?\\d/[01]?\\d)|(?:[01]?\\d/[0-3]?\\d))/(?:[12]\\d)?\\d{2}"))
# Matches timestamps. Examples: "2012-01-02 08:00". Note that the reg-ex does
# not include the trailing ":\d\d" -- that is covered by TIME_STAMPS_SUFFIX.
_TIME_STAMPS = re.compile(u("[12]\\d{3}[-/]?[01]\\d[-/]?[0-3]\\d +[0-2]\\d$"))
_TIME_STAMPS_SUFFIX = re.compile(u(":[0-5]\\d"))
# Patterns used to extract phone numbers from a larger phone-number-like
# pattern. These are ordered according to specificity. For example,
# white-space is last since that is frequently used in numbers, not just to
# separate two numbers. We have separate patterns since we don't want to break
# up the phone-number-like text on more than one different kind of symbol at
# one time, although symbols of the same type (e.g. space) can be safely
# grouped together.
#
# Note that if there is a match, we will always check any text found up to the
# first match as well.
_INNER_MATCHES = (
# Breaks on the slash - e.g. "651-234-2345/332-445-1234"
re.compile(u("/+(.*)")),
# Note that the bracket here is inside the capturing group, since we
# consider it part of the phone number. Will match a pattern like "(650)
# 223 3345 (754) 223 3321".
re.compile(u("(\\([^(]*)")),
# Breaks on a hyphen - e.g. "12345 - 332-445-1234 is my number." We
# require a space on either side of the hyphen for it to be considered a
# separator.
re.compile(u("(?u)(?:\\s-|-\\s)\\s*(.+)")),
# Various types of wide hyphens. Note we have decided not to enforce a
# space here, since it's possible that it's supposed to be used to break
# two numbers without spaces, and we haven't seen many instances of it
# used within a number.
re.compile(u("(?u)[\u2012-\u2015\uFF0D]\\s*(.+)")),
# Breaks on a full stop - e.g. "12345. 332-445-1234 is my number."
re.compile(u("(?u)\\.+\\s*([^.]+)")),
# Breaks on space - e.g. "3324451234 8002341234"
re.compile(u("(?u)\\s+(\\S+)")))
class Leniency(object):
"""Leniency when finding potential phone numbers in text segments.
The levels here are ordered in increasing strictness."""
# Phone numbers accepted are possible (i.e. is_possible_number(number)) but
# not necessarily valid (is_valid_number(number)).
POSSIBLE = 0
# Phone numbers accepted are both possible (is_possible_number(number))
# and valid (is_valid_number(PhoneNumber)). Numbers written in national
# format must have their national-prefix present if it is usually written
# for a number of this type.
VALID = 1
# Phone numbers accepted are valid (i.e. is_valid_number(number)) and are
# grouped in a possible way for this locale. For example, a US number
# written as "65 02 53 00 00" and "650253 0000" are not accepted at this
# leniency level, whereas "650 253 0000", "650 2530000" or "6502530000"
# are.
# Numbers with more than one '/' symbol in the national significant number
# are also dropped at this level.
#
# Warning: This level might result in lower coverage especially for
# regions outside of country code "+1". If you are not sure about which
# level to use, email the discussion group
# libphonenumber-discuss@googlegroups.com.
STRICT_GROUPING = 2
# Phone numbers accepted are valid (i.e. is_valid_number(number)) and are
# grouped in the same way that we would have formatted it, or as a single
# block. For example, a US number written as "650 2530000" is not accepted
# at this leniency level, whereas "650 253 0000" or "6502530000" are.
# Numbers with more than one '/' symbol are also dropped at this level.
# Warning: This level might result in lower coverage especially for
# regions outside of country code "+1". If you are not sure about which
# level to use, email the discussion group
# libphonenumber-discuss@googlegroups.com.
EXACT_GROUPING = 3
def _verify(leniency, numobj, candidate):
"""Returns True if number is a verified number according to the
leniency."""
if leniency == Leniency.POSSIBLE:
return is_possible_number(numobj)
elif leniency == Leniency.VALID:
if (not is_valid_number(numobj) or
not _contains_only_valid_x_chars(numobj, candidate)):
return False
return _is_national_prefix_present_if_required(numobj)
elif leniency == Leniency.STRICT_GROUPING:
return _verify_strict_grouping(numobj, candidate)
elif leniency == Leniency.EXACT_GROUPING:
return _verify_exact_grouping(numobj, candidate)
else:
raise Exception("Error: unsupported Leniency value %s" % leniency)
def _verify_strict_grouping(numobj, candidate):
if (not is_valid_number(numobj) or
not _contains_only_valid_x_chars(numobj, candidate) or
_contains_more_than_one_slash_in_national_number(numobj, candidate) or
not _is_national_prefix_present_if_required(numobj)):
return False
return _check_number_grouping_is_valid(numobj, candidate,
_all_number_groups_remain_grouped)
def _all_number_groups_remain_grouped(numobj, normalized_candidate, formatted_number_groups):
"""Returns True if the groups of digits found in our candidate phone number match our
expectations.
Arguments:
numobj -- the original number we found when parsing
normalized_candidate -- the candidate number, normalized to only contain ASCII digits,
but with non-digits (spaces etc) retained
expected_number_groups -- the groups of digits that we would expect to see if we
formatted this number
Returns True if expectations matched.
"""
from_index = 0
if numobj.country_code_source != CountryCodeSource.FROM_DEFAULT_COUNTRY:
# First skip the country code if the normalized candidate contained it.
country_code = str(numobj.country_code)
from_index = normalized_candidate.find(country_code) + len(country_code)
# Check each group of consecutive digits are not broken into separate
# groupings in the candidate string.
for ii, formatted_number_group in enumerate(formatted_number_groups):
# Fails if the substring of normalized_candidate starting from
# from_index doesn't contain the consecutive digits in
# formatted_number_group.
from_index = normalized_candidate.find(formatted_number_group, from_index)
if from_index < 0:
return False
# Moves from_index forward.
from_index += len(formatted_number_group)
if (ii == 0 and from_index < len(normalized_candidate)):
# We are at the position right after the NDC. We get the region
# used for formatting information based on the country code in the
# phone number, rather than the number itself, as we do not need
# to distinguish between different countries with the same country
# calling code and this is faster.
region = region_code_for_country_code(numobj.country_code)
if (ndd_prefix_for_region(region, True) is not None and
normalized_candidate[from_index].isdigit()):
# This means there is no formatting symbol after the NDC. In
# this case, we only accept the number if there is no
# formatting symbol at all in the number, except for
# extensions. This is only important for countries with
# national prefixes.
nsn = national_significant_number(numobj)
return normalized_candidate[(from_index - len(formatted_number_group)):].startswith(nsn)
# The check here makes sure that we haven't mistakenly already used the extension to
# match the last group of the subscriber number. Note the extension cannot have
# formatting in-between digits.
return (normalized_candidate[from_index:].find(numobj.extension or U_EMPTY_STRING) != -1)
def _verify_exact_grouping(numobj, candidate):
if (not is_valid_number(numobj) or
not _contains_only_valid_x_chars(numobj, candidate) or
_contains_more_than_one_slash_in_national_number(numobj, candidate) or
not _is_national_prefix_present_if_required(numobj)):
return False
return _check_number_grouping_is_valid(numobj, candidate,
_all_number_groups_are_exactly_present)
def _all_number_groups_are_exactly_present(numobj, normalized_candidate, formatted_number_groups):
"""Returns True if the groups of digits found in our candidate phone number match our
expectations.
Arguments:
numobj -- the original number we found when parsing
normalized_candidate -- the candidate number, normalized to only contain ASCII digits,
but with non-digits (spaces etc) retained
expected_number_groups -- the groups of digits that we would expect to see if we
formatted this number
Returns True if expectations matched.
"""
candidate_groups = re.split(NON_DIGITS_PATTERN, normalized_candidate)
# Set this to the last group, skipping it if the number has an extension.
if numobj.extension is not None:
candidate_number_group_index = len(candidate_groups) - 2
else:
candidate_number_group_index = len(candidate_groups) - 1
# First we check if the national significant number is formatted as a
# block. We use contains and not equals, since the national significant
# number may be present with a prefix such as a national number prefix, or
# the country code itself.
if (len(candidate_groups) == 1 or
candidate_groups[candidate_number_group_index].find(national_significant_number(numobj)) != -1):
return True
# Starting from the end, go through in reverse, excluding the first group,
# and check the candidate and number groups are the same.
formatted_number_group_index = len(formatted_number_groups) - 1
while (formatted_number_group_index > 0 and candidate_number_group_index >= 0):
if (candidate_groups[candidate_number_group_index] !=
formatted_number_groups[formatted_number_group_index]):
return False
formatted_number_group_index -= 1
candidate_number_group_index -= 1
# Now check the first group. There may be a national prefix at the start, so we only check
# that the candidate group ends with the formatted number group.
return (candidate_number_group_index >= 0 and
candidate_groups[candidate_number_group_index].endswith(formatted_number_groups[0]))
def _get_national_number_groups(numobj, formatting_pattern=None):
"""Helper method to get the national-number part of a number, formatted without any national
prefix, and return it as a set of digit blocks that would be formatted together."""
if formatting_pattern is None:
# This will be in the format +CC-DG;ext=EXT where DG represents groups of digits.
rfc3966_format = format_number(numobj, PhoneNumberFormat.RFC3966)
# We remove the extension part from the formatted string before splitting
# it into different groups.
end_index = rfc3966_format.find(U_SEMICOLON)
if end_index < 0:
end_index = len(rfc3966_format)
# The country-code will have a '-' following it.
start_index = rfc3966_format.find(U_DASH) + 1
return rfc3966_format[start_index:end_index].split(U_DASH)
else:
# We format the NSN only, and split that according to the separator.
nsn = national_significant_number(numobj)
return _format_nsn_using_pattern(nsn, formatting_pattern,
PhoneNumberFormat.RFC3966).split(U_DASH)
def _check_number_grouping_is_valid(numobj, candidate, checker):
# TODO: Evaluate how this works for other locales (testing has been
# limited to NANPA regions) and optimise if necessary.
normalized_candidate = normalize_digits_only(candidate, True) # keep non-digits
formatted_number_groups = _get_national_number_groups(numobj, None)
if checker(numobj, normalized_candidate, formatted_number_groups):
return True
# If this didn't pass, see if there are any alternate formats, and try them instead.
alternate_formats = _ALT_NUMBER_FORMATS.get(numobj.country_code, None)
if alternate_formats is not None:
for alternate_format in alternate_formats:
formatted_number_groups = _get_national_number_groups(numobj, alternate_format)
if checker(numobj, normalized_candidate, formatted_number_groups):
return True
return False
def _contains_more_than_one_slash_in_national_number(numobj, candidate):
first_slash_in_body_index = candidate.find(U_SLASH)
if first_slash_in_body_index < 0:
# No slashes, this is okay.
return False
# Now look for a second one.
second_slash_in_body_index = candidate.find(U_SLASH, first_slash_in_body_index + 1)
if second_slash_in_body_index < 0:
# Only one slash, this is okay.,
return False
# If the first slash is after the country calling code, this is permitted.
candidate_has_country_code = (numobj.country_code_source == CountryCodeSource.FROM_NUMBER_WITH_PLUS_SIGN or
numobj.country_code_source == CountryCodeSource.FROM_NUMBER_WITHOUT_PLUS_SIGN)
if (candidate_has_country_code and
normalize_digits_only(candidate[:first_slash_in_body_index]) ==
unicod(numobj.country_code)):
# Any more slashes and this is illegal.
return (candidate[(second_slash_in_body_index + 1):].find(U_SLASH) != -1)
return True
def _contains_only_valid_x_chars(numobj, candidate):
# The characters 'x' and 'X' can be (1) a carrier code, in which case they
# always precede the national significant number or (2) an extension sign,
# in which case they always precede the extension number. We assume a
# carrier code is more than 1 digit, so the first case has to have more
# than 1 consecutive 'x' or 'X', whereas the second case can only have
# exactly 1 'x' or 'X'. We ignore the character if it appears as the last
# character of the string.
ii = 0
while ii < (len(candidate) - 1):
if (candidate[ii] == U_X_LOWER or candidate[ii] == U_X_UPPER):
next_char = candidate[ii + 1]
if (next_char == U_X_LOWER or next_char == U_X_UPPER):
# This is the carrier code case, in which the 'X's always
# precede the national significant number.
ii += 1
if is_number_match(numobj, candidate[ii:]) != MatchType.NSN_MATCH:
return False
# This is the extension sign case, in which the 'x' or 'X' should
# always precede the extension number.
elif normalize_digits_only(candidate[ii:]) != numobj.extension:
return False
ii += 1
return True
def _is_national_prefix_present_if_required(numobj):
# First, check how we deduced the country code. If it was written in
# international format, then the national prefix is not required.
if numobj.country_code_source != CountryCodeSource.FROM_DEFAULT_COUNTRY:
return True
phone_number_region = region_code_for_country_code(numobj.country_code)
metadata = PhoneMetadata.metadata_for_region(phone_number_region, None)
if metadata is None:
return True
# Check if a national prefix should be present when formatting this number.
national_number = national_significant_number(numobj)
format_rule = _choose_formatting_pattern_for_number(metadata.number_format,
national_number)
# To do this, we check that a national prefix formatting rule was present
# and that it wasn't just the first-group symbol ($1) with punctuation.
if (format_rule is not None and
format_rule.national_prefix_formatting_rule):
if format_rule.national_prefix_optional_when_formatting:
# The national-prefix is optional in these cases, so we don't need
# to check if it was present.
return True
if _formatting_rule_has_first_group_only(format_rule.national_prefix_formatting_rule):
# National Prefix not needed for this number.
return True
# Normalize the remainder.
raw_input = normalize_digits_only(numobj.raw_input)
# Check if we found a national prefix and/or carrier code at the start of the raw input,
# and return the result.
return _maybe_strip_national_prefix_carrier_code(raw_input, metadata)[2]
return True
class PhoneNumberMatcher(object):
"""A stateful class that finds and extracts telephone numbers from text.
Vanity numbers (phone numbers using alphabetic digits such as '1-800-SIX-FLAGS' are
not found.
This class is not thread-safe.
"""
# The potential states of a PhoneNumberMatcher.
_NOT_READY = 0
_READY = 1
_DONE = 2
def __init__(self, text, region,
leniency=Leniency.VALID, max_tries=65535):
"""Creates a new instance.
Arguments:
text -- The character sequence that we will search, None for no text.
country -- The country to assume for phone numbers not written in
international format (with a leading plus, or with the
international dialing prefix of the specified region). May be
None or "ZZ" if only numbers with a leading plus should be
considered.
leniency -- The leniency to use when evaluating candidate phone
numbers.
max_tries -- The maximum number of invalid numbers to try before
giving up on the text. This is to cover degenerate cases where
the text has a lot of false positives in it. Must be >= 0.
"""
if leniency is None:
raise ValueError("Need a leniency value")
if int(max_tries) < 0:
raise ValueError("Need max_tries to be positive int")
# The text searched for phone numbers.
self.text = text
if self.text is None:
self.text = U_EMPTY_STRING
# The region (country) to assume for phone numbers without an
# international prefix, possibly None.
self.preferred_region = region
# The degree of validation requested.
self.leniency = leniency
# The maximum number of retries after matching an invalid number.
self._max_tries = int(max_tries)
# The iteration tristate.
self._state = PhoneNumberMatcher._NOT_READY
# The last successful match, None unless in state _READY
self._last_match = None
# The next index to start searching at. Undefined in state _DONE
self._search_index = 0
def _find(self, index):
"""Attempts to find the next subsequence in the searched sequence on or after index
that represents a phone number. Returns the next match, None if none was found.
Arguments:
index -- The search index to start searching at.
Returns the phone number match found, None if none can be found.
"""
match = _PATTERN.search(self.text, index)
while self._max_tries > 0 and match is not None:
start = match.start()
candidate = self.text[start:match.end()]
# Check for extra numbers at the end.
# TODO: This is the place to start when trying to support
# extraction of multiple phone number from split notations (+41 79
# 123 45 67 / 68).
candidate = self._trim_after_first_match(_SECOND_NUMBER_START_PATTERN,
candidate)
match = self._extract_match(candidate, start)
if match is not None:
return match
# Move along
index = start + len(candidate)
self._max_tries -= 1
match = _PATTERN.search(self.text, index)
return None
def _trim_after_first_match(self, pattern, candidate):
"""Trims away any characters after the first match of pattern in
candidate, returning the trimmed version."""
trailing_chars_match = pattern.search(candidate)
if trailing_chars_match:
candidate = candidate[:trailing_chars_match.start()]
return candidate
@classmethod
def _is_latin_letter(cls, letter):
"""Helper method to determine if a character is a Latin-script letter
or not. For our purposes, combining marks should also return True
since we assume they have been added to a preceding Latin character."""
# Combining marks are a subset of non-spacing-mark
if (not is_letter(letter) and
Category.get(letter) != Category.NON_SPACING_MARK):
return False
block = Block.get(letter)
return (block == Block.BASIC_LATIN or
block == Block.LATIN_1_SUPPLEMENT or
block == Block.LATIN_EXTENDED_A or
block == Block.LATIN_EXTENDED_ADDITIONAL or
block == Block.LATIN_EXTENDED_B or
block == Block.COMBINING_DIACRITICAL_MARKS)
@classmethod
def _is_invalid_punctuation_symbol(cls, character):
return (character == U_PERCENT or
Category.get(character) == Category.CURRENCY_SYMBOL)
def _extract_match(self, candidate, offset):
"""Attempts to extract a match from a candidate string.
Arguments:
candidate -- The candidate text that might contain a phone number.
offset -- The offset of candidate within self.text
Returns the match found, None if none can be found
"""
# Skip a match that is more likely a publication page reference or a
# date.
if (_SLASH_SEPARATED_DATES.search(candidate)):
return None
# Skip potential time-stamps.
if _TIME_STAMPS.search(candidate):
following_text = self.text[offset + len(candidate):]
if _TIME_STAMPS_SUFFIX.match(following_text):
return None
# Try to come up with a valid match given the entire candidate.
match = self._parse_and_verify(candidate, offset)
if match is not None:
return match
# If that failed, try to find an "inner match" -- there might be a
# phone number within this candidate.
return self._extract_inner_match(candidate, offset)
def _extract_inner_match(self, candidate, offset):
"""Attempts to extract a match from candidate if the whole candidate
does not qualify as a match.
Arguments:
candidate -- The candidate text that might contain a phone number
offset -- The current offset of candidate within text
Returns the match found, None if none can be found
"""
for possible_inner_match in _INNER_MATCHES:
group_match = possible_inner_match.search(candidate)
is_first_match = True
while group_match and self._max_tries > 0:
if is_first_match:
# We should handle any group before this one too.
group = self._trim_after_first_match(_UNWANTED_END_CHAR_PATTERN,
candidate[:group_match.start()])
match = self._parse_and_verify(group, offset)
if match is not None:
return match
self._max_tries -= 1
is_first_match = False
group = self._trim_after_first_match(_UNWANTED_END_CHAR_PATTERN,
group_match.group(1))
match = self._parse_and_verify(group, offset + group_match.start(1))
if match is not None:
return match
self._max_tries -= 1
group_match = possible_inner_match.search(candidate, group_match.start() + 1)
return None
def _parse_and_verify(self, candidate, offset):
"""Parses a phone number from the candidate using phonenumberutil.parse and
verifies it matches the requested leniency. If parsing and verification succeed, a
corresponding PhoneNumberMatch is returned, otherwise this method returns None.
Arguments:
candidate -- The candidate match.
offset -- The offset of candidate within self.text.
Returns the parsed and validated phone number match, or None.
"""
try:
# Check the candidate doesn't contain any formatting which would
# indicate that it really isn't a phone number.
if (not fullmatch(_MATCHING_BRACKETS, candidate) or _PUB_PAGES.search(candidate)):
return None
# If leniency is set to VALID or stricter, we also want to skip
# numbers that are surrounded by Latin alphabetic characters, to
# skip cases like abc8005001234 or 8005001234def.
if self.leniency >= Leniency.VALID:
# If the candidate is not at the start of the text, and does
# not start with phone-number punctuation, check the previous
# character
if (offset > 0 and
not _LEAD_PATTERN.match(candidate)):
previous_char = self.text[offset - 1]
# We return None if it is a latin letter or an invalid
# punctuation symbol
if (self._is_invalid_punctuation_symbol(previous_char) or
self._is_latin_letter(previous_char)):
return None
last_char_index = offset + len(candidate)
if last_char_index < len(self.text):
next_char = self.text[last_char_index]
if (self._is_invalid_punctuation_symbol(next_char) or
self._is_latin_letter(next_char)):
return None
numobj = parse(candidate, self.preferred_region, keep_raw_input=True)
# Check Israel * numbers: these are a special case in that they
# are four-digit numbers that our library supports, but they can
# only be dialled with a leading *. Since we don't actually store
# or detect the * in our phone number library, this means in
# practice we detect most four digit numbers as being valid for
# Israel. We are considering moving these numbers to
# ShortNumberInfo instead, in which case this problem would go
# away, but in the meantime we want to restrict the false matches
# so we only allow these numbers if they are preceded by a
# star. We enforce this for all leniency levels even though these
# numbers are technically accepted by isPossibleNumber and
# isValidNumber since we consider it to be a deficiency in those
# methods that they accept these numbers without the *.
# TODO: Remove this or make it significantly less hacky once we've
# decided how to handle these short codes going forward in
# ShortNumberInfo. We could use the formatting rules for instance,
# but that would be slower.
if (region_code_for_country_code(numobj.country_code) == "IL" and
len(national_significant_number(numobj)) == 4 and
(offset == 0 or (offset > 0 and self.text[offset - 1] != U_STAR))):
# No match.
return None
if _verify(self.leniency, numobj, candidate):
# We used parse(keep_raw_input=True) to create this number,
# but for now we don't return the extra values parsed.
# TODO: stop clearing all values here and switch all users
# over to using raw_input rather than the raw_string of
# PhoneNumberMatch.
numobj.country_code_source = CountryCodeSource.UNSPECIFIED
numobj.raw_input = None
numobj.preferred_domestic_carrier_code = None
return PhoneNumberMatch(offset, candidate, numobj)
except NumberParseException:
# ignore and continue
pass
return None
def has_next(self):
"""Indicates whether there is another match available"""
if self._state == PhoneNumberMatcher._NOT_READY:
self._last_match = self._find(self._search_index)
if self._last_match is None:
self._state = PhoneNumberMatcher._DONE
else:
self._search_index = self._last_match.end
self._state = PhoneNumberMatcher._READY
return (self._state == PhoneNumberMatcher._READY)
def next(self):
"""Return the next match; raises Exception if no next match available"""
# Check the state and find the next match as a side-effect if necessary.
if not self.has_next():
raise StopIteration("No next match")
# Don't retain that memory any longer than necessary.
result = self._last_match
self._last_match = None
self._state = PhoneNumberMatcher._NOT_READY
return result
def __iter__(self):
while self.has_next():
yield self.next()
class PhoneNumberMatch(UnicodeMixin):
"""The immutable match of a phone number within a piece of text.
Matches may be found using the find() method of PhoneNumberMatcher.
A match consists of the phone number (in .number) as well as the .start
and .end offsets of the corresponding subsequence of the searched
text. Use .raw_string to obtain a copy of the matched subsequence.
The following annotated example clarifies the relationship between the
searched text, the match offsets, and the parsed number:
>>> text = "Call me at +1 425 882-8080 for details."
>>> country = "US"
>>> import phonenumbers
>>> matcher = phonenumbers.PhoneNumberMatcher(text, country)
>>> matcher.has_next()
True
>>> m = matcher.next() # Find the first phone number match
>>> m.raw_string # contains the phone number as it appears in the text.
"+1 425 882-8080"
>>> (m.start, m.end) # define the range of the matched subsequence.
(11, 26)
>>> text[m.start, m.end]
"+1 425 882-8080"
>>> phonenumberutil.parse("+1 425 882-8080", "US") == m.number
True
"""
def __init__(self, start, raw_string, numobj):
if start < 0:
raise Exception("Start index not >= 0")
if raw_string is None or numobj is None:
raise Exception("Invalid argument")
# The start index into the text.
self.start = start
# The raw substring matched.
self.raw_string = raw_string
self.end = self.start + len(raw_string)
# The matched phone number.
self.number = numobj
def __eq__(self, other):
if not isinstance(other, PhoneNumberMatch):
return False
return (self.start == other.start and
self.raw_string == other.raw_string and
self.end == other.end and
self.number == other.number)
def __ne__(self, other):
return not self.__eq__(other)
def __repr__(self):
return (unicod("PhoneNumberMatch(start=%r, raw_string=%r, numobj=%r)") %
(self.start,
self.raw_string,
self.number))
def __unicode__(self):
return unicod("PhoneNumberMatch [%s,%s) %s") % (self.start, self.end, self.raw_string)
|