This file is indexed.

/usr/share/link-grammar/vn/4.0.regex is in link-grammar-dictionaries-all 5.3.16-2.

This file is owned by root:root, with mode 0o644.

The actual contents of the file can be viewed below.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
 %***************************************************************************%
 %                                                                           %
 %  Copyright (C) 2005, 2006 Sampo Pyysalo, Sophie Aubin                     %
 %  Copyright (C) 2009, 2012 Linas Vepstas                                   %
 %  See file "LICENSE" for information about commercial use of this system   %
 %                                                                           %
 %***************************************************************************%

% This file contains regular expressions that are used to match
% tokens not found in the dictionary. Each regex is given a name which
% determines the disjuncts assigned when the regex matches; this name
% must be defined in the dictionary along with the appropriate disjuncts.
% Note that the order of the regular expressions matters: matches will
% be attempted in the order in which the regexes appear in this file,
% and only the first match will be used.
%
% Regexes that are preceded by !, if they match a token, stop
% further match tries of the same regex name. Thus, they can serve
% as a kind of negative look-ahead.

% Numbers.
% XXX, we need to add utf8 U+00A0 "no-break space"
%
% Clock times of the form HH, HH:MM or HH:MM:SS, optionally followed
% directly (no space) by AM, PM, am or pm.
% Allows at most two colons in hour-minute-second HH:MM:SS expressions
% Allows at most two digits between colons
% Note: the colon groups and the AM/PM suffix are all optional, so a bare
% one- or two-digit token such as "7" or "12" also matches this pattern.
% No range check is made on the digits: "99:99:99" matches as well.
HMS-TIME: /^[0-9][0-9]?(:[0-9][0-9]?(:[0-9][0-9]?)?)?(AM|PM|am|pm)?$/

% Decades, e.g. 1950's. The leading number can be higher than the current
% century, for science fiction.
% Must be four digits (1000 through 4990), or possibly three (100 through
% 990), and the last digit must be 0. Must end in one of: s, 's (ASCII
% apostrophe) or ’s (typographic apostrophe).
DECADE-DATE: /^([1-4][0-9][0-9]|[1-9][0-9])0(s|'s|’s)$/

% Similar to above, but does not end in s. Accepts four-digit years from
% 1000 through 4999, and also one- to three-digit years from 1 through 999
% (no leading zero is allowed in either form).
% We process this before NUMBERS below, so that this is matched first.
% NOTE(review): HMS-TIME appears earlier in this file and also accepts bare
% one- and two-digit tokens, so presumably those never reach this regex.
YEAR-DATE: /^([1-4][0-9]{3}|[1-9][0-9]{0,2})$/

% Day-of-month names; this regex will match before the one below.
% Matches exactly the ordinals 1st through 31st, each with its correct
% English suffix (e.g. 11th, 12th, 13th, 21st, 22nd, 23rd).
DAY-ORDINALS: /^(1st|2nd|3rd|[4-9]th|1[0-9]th|2(0th|1st|2nd|3rd|[4-9]th)|30th|31st)$/

% Ordinal numbers; everything except 1st through 13th
% is handled by regex.
% The suffix is chosen from the final digit (0th, 1st, 2nd, 3rd, 4th-9th).
% The extra alternative 1[1-3]th covers the irregular "teens" in longer
% numbers such as 111th, 212th or 1013th, which end in "th" rather than
% "st", "nd" or "rd".
% Note: no check is made that the suffix agrees with the tens digit, so
% forms such as "11st" are accepted too.
ORDINALS: /^[1-9][0-9]*(1[1-3]th|0th|1st|2nd|3rd|[4-9]th)$/

% Plain numbers: digits with any number of embedded commas or periods.
% The token must end in a digit, so that we are careful not to match the
% period at the end of a sentence; for example: "It happened in 1942."
NUMBERS: /^[0-9,.]*[0-9]$/
% This parses signed numbers and ranges, e.g. "-5" and "5-10" and "9+/-6.5"
% The optional trailing group is a literal "+/-" followed by a second number.
NUMBERS: /^[0-9.,-]*[0-9](\+\/-[0-9.,-]*[0-9])?$/
% Parses simple fractions e.g. "1/60" with no decimal points or anything fancy
FRACTION: /^[0-9]+\/[0-9]+$/
% "10(3)" exponent (used in PubMed)
% The parenthesized part may additionally contain colons.
NUMBERS: /^[0-9.,-]*[0-9][0-9.,-]*\([0-9:.,-]*[0-9][0-9.,-]*\)$/

% Roman numerals
% The pattern is built from four consecutive parts: thousands (M*),
% hundreds (CM, CD, or D?C{0,3}), tens (XC, XL, or L?X{0,3}) and
% units (IX, IV, or V?I{0,3}). Only upper-case letters are matched.
% The first expr has the problem that it matches an empty string, because
% every one of its parts is optional.  The cure for this is to use
% look-ahead, but neither the GNU nor the BSD regex libs support
% look-ahead. I can't think of a better solution.
ROMAN-NUMERAL-WORDS: /^M*(CM|D?C{0,3}|CD)(XC|L?X{0,3}|XL)(IX|V?I{0,3}|IV)$/
% The two commented-out variants below are the look-ahead versions
% mentioned above; they are kept for reference only.
% ROMAN-NUMERAL-WORDS: /^(?=(M|C|D|L|X|V|I)+)M*(CM|D?C{0,3}|CD)(XC|L?X{0,3}|XL)(IX|V?I{0,3}|IV)$/
% ROMAN-NUMERAL-WORDS: /^(?=.+)M*(CM|D?C{0,3}|CD)(XC|L?X{0,3}|XL)(IX|V?I{0,3}|IV)$/

% Strings of initials. e.g "Dr. J.G.D. Smith lives on Main St."
% Each initial is one upper-case ASCII letter followed by a period.
% Make it at least two letters long, as otherwise it clobbers
% single-letter handling in the dict, which is different.
INITIALS: /^[A-Z]\.([A-Z]\.)+$/

% Greek letters with numbers
% A lower-case, spelled-out Greek letter name, an optional hyphen, then
% one or more digits; e.g. "alpha1" or "beta-2".
GREEK-LETTER-AND-NUMBER: /^(alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|omicron|pi|rho|sigma|tau|upsilon|phi|chi|psi|omega)\-?[0-9]+$/
% Plural form: identical to the above, except that the letter name is
% followed by "s"; e.g. "alphas1" or "betas-2".
PL-GREEK-LETTER-AND-NUMBER: /^(alpha|beta|gamma|delta|epsilon|zeta|eta|theta|iota|kappa|lambda|mu|nu|xi|omicron|pi|rho|sigma|tau|upsilon|phi|chi|psi|omega)s\-?[0-9]+$/

% Other proper nouns.
% We demand that these end with an alphanumeric, i.e. explicitly
% reject punctuation. We don't want this regex to "swallow" any trailing
% commas, colons, or periods/question-marks at the end of sentences.
% In addition, this must not swallow words ending in 's 'll etc.
% (... any affix, for that matter ...) and so no embedded apostrophe
% (neither the ASCII ' nor the typographic ’).
% Note: the pattern needs at least two characters (one upper-case letter
% plus a final non-punctuation character), so a single capital letter does
% not match. The final class is "anything that is not punctuation", which
% is slightly broader than "alphanumeric".
CAPITALIZED-WORDS:     /^[[:upper:]][^'’]*[^[:punct:]]$/

% Hyphenated words. In the original LG morpho-guessing system that
% predated the regex-based system, hyphenated words were detected
% before ING-WORDS, S-WORDS etc., causing e.g. "cross-linked" to be
% treated as a HYPHENATED-WORD (a generic adjective/noun), and
% never a verb. To return to this ordering, move this regex just
% after the CAPITALIZED-WORDS regex.
% We also match on commas, dots, brackets: n-amino-3-azabicyclo[3.3.0]octane
% []] means "match right-bracket"
% Explicitly call out (5'|3') so that we don't allow a generic match to 'll
% But something is funky about this 5'-3' business since 2' also matches ???
% NOTE: the reason is that (5'|3') is written *inside* a bracket
% expression, where ( | ) are ordinary literal characters rather than
% grouping/alternation operators. It therefore just adds the individual
% characters ( 5 ' | 3 ) to the set, so an apostrophe is accepted after
% any character, not only after 5 or 3.
% The previous form of the regex, without the (5'|3') part, was:
%  /^[[:alnum:]][][:alnum:],:.\[-]*-[][:alnum:],:.\[-]*[[:alnum:]]$/
HYPHENATED-WORDS:
  /^[[:alnum:](5'|3')][][:alnum:](5'|3'),:.\(\)\[-]*-[][:alnum:],:.\(\)\[-]*[[:alnum:]]$/

% Emoticon checks must come *after* the above, so that the above take precedence.
% See Wikipedia List_of_emoticons (also the References section).
%
% Emoticons must be entirely made of punctuation, length 2 or longer ;) 
% XXX [:punct:] is strangely broken, I have to add ;-< explicitly
% XXX: Don't use [:punct:].  Do NOT include period!!
% XXX: The problem with below is that 5. 7. 8. get recognized as emoticons,
% which then prevents splitting for list numbers.  (e.g "step 5. Do this.")
%
% Arghh. Other valid number expressions are clobbered by the emoticons.
% For example: $5 $7 8%  The quick fix is to remove the numbers.
% Other breakages: The below clobbers "Bob, who ..." because it
% matches Bob, as an emoticon.
%
% EMOTICON: /^[[:punct:];BDOpTX0578C☆ಠ●@◎~][[:punct:]<bcdDLmoOpPSTvX0358ಠっ○ 。゜✿☆*レツ◕●≧∇≦□◇@◎∩ω旦ヨ彡ミ‿◠ ̄ー~━-]+$/
% EMOTICON: /^[!"#$%&'()*+,\-/:;<=>?@[\\\]^_`{|}~;BDOpTX0578C☆ಠ●@◎~][!"#$%&'()*+,\-/:;<=>?@[\\\]^_`{|}~<bcdDLmoOpPSTvX0358ಠっ○ 。゜✿☆*レツ◕●≧∇≦□◇@◎∩ω旦ヨ彡ミ‿◠ ̄ー~━-]+$/
% Negated guard (note the leading !): a token that starts with a double
% quote, or that ends with one or more alphanumerics followed by a double
% quote, is not tried against the EMOTICON pattern below.
EMOTICON: !/^"|[[:alnum:]]+"$/
% The emoticon proper: one leading character (any punctuation, or one of
% the listed letters/symbols) followed by one or more characters from the
% second set. Digits and the period are deliberately absent from both
% sets so that tokens such as "5." "$5" or "8%" are not taken as emoticons.
EMOTICON: /^[[:punct:];BC☆ಠ●@◎~][-!"#$%&'()+,:;<=>?@[\\^_`{|}~<cdDLmoOpPSTvXಠっ○ 。゜✿☆*レツ◕●≧∇≦□◇@◎∩ω旦ヨ彡ミ‿◠ ̄ー~━-]+$/

% Part numbers should not match words with punctuation at their end.
% Else sentences like "I saw him on January 21, 1990" have problems.
% They should contain at least one number, and should not have dashes at their
% start or end. A $ sign at the start is also too confusing.
% The current regex system and the syntax of this file are not expressive enough
% for things that should not be included. For example, we cannot prevent several
% sequential "#" or dashes. It may match a word consisting of number+units, but
% separate_word() will generate an alternative anyway.
% The second part of this regex is for NNN-NNN in sentences like
% "The plane is a 747-400".  However, such words currently match NUMBERS.
% It requires two or more digits (no leading zero), then a single slash or
% dash, then one or more digits.
PART-NUMBER:
  /^[A-Z0-9#][A-Z0-9$\/#]*[A-Z0-9$\/#,.-]*[0-9][A-Z0-9$\/#,.-]*[A-Z0-9$\/#]+$|^[1-9][0-9]+[\/-][0-9]+$/

% Sequence of punctuation marks. If some mark appears in the affix table
% such as a period, comma, dash or underscore, and there's a sequence of
% these, then treat it as a "fill-in-the-blank" placeholder.
% This matters only for punc. appearing in the affix table, since the
% tokenizer explicitly mangles based on these punctuation marks.
%
% Look for at least four in a row. The marks may be mixed freely.
% Note: only period, comma and dash are in the character class below;
% underscore, although mentioned above, is not matched by this regex.
UNKNOWN-WORD: /^[.,-]{4}[.,-]*$/