% /usr/share/link-grammar/vn/4.0.regex

 %***************************************************************************%
 %                                                                           %
 %  Copyright (C) 2005, 2006 Sampo Pyysalo, Sophie Aubin                     %
 %  Copyright (C) 2009, 2012 Linas Vepstas                                   %
 %  See file "LICENSE" for information about commercial use of this system   %
 %                                                                           %
 %***************************************************************************%

% This file contains regular expressions that are used to match
% tokens not found in the dictionary. Each regex is given a name which
% determines the disjuncts assigned when the regex matches; this name
% must be defined in the dictionary along with the appropriate disjuncts.
% Note that the order of the regular expressions matters: matches will
% be attempted in the order in which the regexs appear in this file,
% and only the first match will be used.
%
% A regex that is preceded by ! is a negative pattern: if it matches a
% token, no further regexes with the same name are tried for that token.
% Thus, such regexes can serve as a kind of negative look-ahead.

% Numbers.
% XXX, we need to add utf8 U+00A0 "no-break space"
%
% Times of day: H, HH, HH:MM or HH:MM:SS, each field being one or two
% digits, optionally followed directly by AM, PM, am or pm
% (e.g. "9:30", "12:05:59pm").
% Allows at most two colons in hour-minute-second HH:MM:SS expressions.
% Allows at most two digits between colons.
% Note that the colon groups are optional, so a bare one- or two-digit
% number such as "7" or "12" also matches here, ahead of the numeric
% regexes below.
HMS-TIME: /^[0-9][0-9]?(:[0-9][0-9]?(:[0-9][0-9]?)?)?(AM|PM|am|pm)?$/

% Decades, e.g. "1950s" or "1950's". The leading digit can go as high
% as 4, for science fiction.
% Must be four digits (1000 to 4990), or possibly three (100 to 990),
% with the last digit being 0. Must end in s, 's or ’s
DECADE-DATE: /^([1-4][0-9][0-9]|[1-9][0-9])0(s|'s|’s)$/

% Similar to above, but does not end in s. Matches either a four-digit
% year from 1000 to 4999, or a one- to three-digit number with no
% leading zero (1 to 999).
% We process this before NUMBERS below, so that this is matched first.
YEAR-DATE: /^([1-4][0-9]{3}|[1-9][0-9]{0,2})$/

% Day-of-month ordinals, 1st through 31st, each with its correct suffix
% (e.g. "11th", "22nd", "31st"). This regex will match before the
% ORDINALS one below.
DAY-ORDINALS: /^(1st|2nd|3rd|[4-9]th|1[0-9]th|2(0th|1st|2nd|3rd|[4-9]th)|30th|31st)$/

% Ordinal numbers of two or more digits (10th and up); single-digit
% ordinals are not matched here.
% The suffix must agree with the last two digits: a number whose tens
% digit is 1 always takes "th" (e.g. "11th", "111th", "212th"), anything
% else takes 0th/1st/2nd/3rd/[4-9]th according to its last digit.
% Malformed forms such as "11st", "12nd" or "113rd" are rejected.
% The three alternatives are, in order:
%   - any number ending in 10..19, followed by "th";
%   - three or more digits whose tens digit is not 1;
%   - two digits, 20 through 99.
ORDINALS: /^(([1-9][0-9]*)?1[0-9]th|[1-9][0-9]*[02-9](0th|1st|2nd|3rd|[4-9]th)|[2-9](0th|1st|2nd|3rd|[4-9]th))$/

% Plain numbers: digits with any number of commas or periods mixed in.
% The token must end in a digit, so that we are careful not to match
% the period at the end of a sentence;
% for example: "It happened in 1942."
NUMBERS: /^[0-9,.]*[0-9]$/
% This parses signed numbers and ranges, e.g. "-5" and "5-10", as well
% as plus-or-minus expressions such as "9+/-6.5". Each side of the
% "+/-" must end in a digit.
NUMBERS: /^[0-9.,-]*[0-9](\+\/-[0-9.,-]*[0-9])?$/
% Parses simple fractions, e.g. "1/60": one or more digits, a slash,
% one or more digits. No decimal points, signs or anything fancy.
FRACTION: /^[0-9]+\/[0-9]+$/
% "10(3)" exponent (used in PubMed): a number immediately followed by a
% parenthesized number. Both parts must contain at least one digit; the
% parenthesized part may additionally contain colons.
NUMBERS: /^[0-9.,-]*[0-9][0-9.,-]*\([0-9:.,-]*[0-9][0-9.,-]*\)$/

% Roman numerals, upper-case only: thousands (M*), then hundreds, tens
% and units, each in subtractive notation.
% The active expr has the problem that it matches an empty string, since
% every one of its components is optional.  The cure for this is to use
% look-ahead, but neither the Gnu nor the BSD regex libs support
% look-ahead. I can't think of a better solution.
% The two look-ahead variants are kept below, commented out, for reference.
ROMAN-NUMERAL-WORDS: /^M*(CM|D?C{0,3}|CD)(XC|L?X{0,3}|XL)(IX|V?I{0,3}|IV)$/
% ROMAN-NUMERAL-WORDS: /^(?=(M|C|D|L|X|V|I)+)M*(CM|D?C{0,3}|CD)(XC|L?X{0,3}|XL)(IX|V?I{0,3}|IV)$/
% ROMAN-NUMERAL-WORDS: /^(?=.+)M*(CM|D?C{0,3}|CD)(XC|L?X{0,3}|XL)(IX|V?I{0,3}|IV)$/

% Strings of initials, e.g. "Dr. J.G.D. Smith lives on Main St."
% Each initial is a single letter A-Z followed by a period.
% Require at least two initials, as otherwise it clobbers
% single-letter handling in the dict, which is different.
INITIALS: /^[A-Z]\.([A-Z]\.)+$/

% Greek letters with numbers: a lower-case Greek letter name, an
% optional hyphen, then one or more digits, e.g. "alpha-2" or "beta1".
GREEK-LETTER-AND-NUMBER: /^(alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|omicron|pi|rho|sigma|tau|upsilon|phi|chi|psi|omega)\-?[0-9]+$/
% Same as above, but with the letter name in the plural, e.g. "alphas-2".
PL-GREEK-LETTER-AND-NUMBER: /^(alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|omicron|pi|rho|sigma|tau|upsilon|phi|chi|psi|omega)s\-?[0-9]+$/

% Other proper nouns: any token that starts with an upper-case letter.
% We demand that these end with a non-punctuation character, i.e. we
% explicitly reject trailing punctuation. We don't want this regex to
% "swallow" any trailing commas, colons, or periods/question-marks at
% the end of sentences.
% In addition, this must not swallow words ending in 's 'll etc.
% (... any affix, for that matter ...) and so no embedded apostrophe,
% straight or curly, is allowed.
% Note that the token must be at least two characters long: the initial
% upper-case letter plus the final non-punctuation character.
CAPITALIZED-WORDS:     /^[[:upper:]][^'’]*[^[:punct:]]$/

% Hyphenated words. In the original LG morpho-guessing system that
% predated the regex-based system, hyphenated words were detected
% before ING-WORDS, S-WORDS etc., causing e.g. "cross-linked" to be
% treated as a HYPHENATED-WORD (a generic adjective/noun), and
% never a verb. To return to this ordering, move this regex just
% after the CAPITALIZED-WORDS regex.
% We also match on commas, dots, brackets: n-amino-3-azabicyclo[3.3.0]octane
% []] means "match right-bracket"
% Explicitly call out (5'|3') so that we don't allow a generic match to 'll
% But something is funky about this 5'-3' business since 2' also matches ???
% NOTE(review): the (5'|3') sits inside a bracket expression, where it
% is presumably not an alternation but simply adds the literal
% characters ( 5 ' | 3 ) to the set. That would explain why 2' matches:
% 2 is alnum, and ' is then accepted as an ordinary member of the set.
% Confirm against the regex library in use before changing the pattern.
% The previous version, without the 5'/3' and parenthesis handling, was:
%  /^[[:alnum:]][][:alnum:],:.\[-]*-[][:alnum:],:.\[-]*[[:alnum:]]$/
HYPHENATED-WORDS:
  /^[[:alnum:](5'|3')][][:alnum:](5'|3'),:.\(\)\[-]*-[][:alnum:],:.\(\)\[-]*[[:alnum:]]$/

% Emoticon checks must come *after* the above, so that the above take precedence.
% See Wikipedia List_of_emoticons (also the References section).
%
% Emoticons must be entirely made of punctuation, length 2 or longer ;) 
% XXX [:punct:] is strangely broken, I have to add ;-< explicitly
% XXX: Don't use [:punct:].  Do NOT include period!!
% XXX: The problem with below is that 5. 7. 8. get recognized as emoticons,
% which then prevents splitting for list numbers.  (e.g "step 5. Do this.")
%
% Arghh. Other valid number expressions are clobbered by the emoticons.
% For example: $5 $7 8%  The quick fix is to remove the numbers.
% Other breakages: The below clobbers "Bob, who ..." because it
% matches Bob, as an emoticon.
%
% EMOTICON: /^[[:punct:];BDOpTX0578Ｃ☆ಠ●＠◎～][[:punct:]<bcdDLmoOpPSTvX0358ಠっ○ 。゜✿☆＊レツ◕●≧∇≦□◇＠◎∩ω旦ヨ彡ミ‿◠￣ー～━-]+$/
% EMOTICON: /^[!"#$%&'()*+,\-/:;<=>?@[\\\]^_`{|}~;BDOpTX0578Ｃ☆ಠ●＠◎～][!"#$%&'()*+,\-/:;<=>?@[\\\]^_`{|}~<bcdDLmoOpPSTvX0358ಠっ○ 。゜✿☆＊レツ◕●≧∇≦□◇＠◎∩ω旦ヨ彡ミ‿◠￣ー～━-]+$/
% Negative guard (note the leading !): a token that starts with a
% double-quote, or that ends in one or more alphanumerics followed by a
% double-quote, is never treated as an emoticon.
EMOTICON: !/^"|[[:alnum:]]+"$/
% The emoticon proper: one character from the first set (punctuation
% plus a few letters and symbols), followed by one or more characters
% from the second set. Digits and the period are deliberately absent
% from the explicit lists, for the reasons given above.
EMOTICON: /^[[:punct:];BＣ☆ಠ●＠◎～][-!"#$%&'()+,:;<=>?@[\\^_`{|}~<cdDLmoOpPSTvXಠっ○ 。゜✿☆＊レツ◕●≧∇≦□◇＠◎∩ω旦ヨ彡ミ‿◠￣ー～━-]+$/

% Part numbers should not match words with punctuation at their end.
% Else sentences like "I saw him on January 21, 1990" have problems.
% They should contain at least one number, and should not have dashes at their
% start or end. A $ sign at the start is also too confusing.
% The current regex system and the syntax of this file are not expressive enough
% for things that should not be included. For example, we cannot prevent several
% sequential "#" or dashes. It may match a word consisting of number+units, but
% separate_word() will generate an alternative anyway.
% The second part of this regex is for NNN-NNN in sentences like
% "The plane is a 747-400": two or more digits, a slash or a dash, then
% one or more digits.  However, such words currently match NUMBERS.
% (The trailing group must be [0-9]+ and not [0-9+]: the latter accepts
% only a single character, which may even be a literal "+", so it would
% reject "747-400" yet accept "747-+".)
PART-NUMBER:
  /^[A-Z0-9#][A-Z0-9$\/#]*[A-Z0-9$\/#,.-]*[0-9][A-Z0-9$\/#,.-]*[A-Z0-9$\/#]+$|^[1-9][0-9]+[\/-][0-9]+$/

% Sequence of punctuation marks. If some mark appears in the affix table
% such as a period, comma, dash or underscore, and there's a sequence of
% these, then treat it as a "fill-in-the-blank" placeholder.
% This matters only for punctuation appearing in the affix table, since
% the tokenizer explicitly mangles based on these punctuation marks.
%
% Look for at least four in a row. The marks may be mixed freely.
% NOTE(review): the character class below holds only period, comma and
% dash; underscore, although mentioned above, is not included. Confirm
% whether that is intentional.
UNKNOWN-WORD: /^[.,-]{4}[.,-]*$/
% link-grammar-dictionaries-all 5.3.16-2 / usr / share / link-grammar / vn / 4.0.regex