/usr/share/gocode/src/gopkg.in/neurosnap/sentences.v1/annotate.go is in golang-gopkg-neurosnap-sentences.v1-dev 1.0.6-1.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 | package sentences
import (
"strings"
)
/*
AnnotateTokens is an interface used by the sentence tokenizer to add properties to
any given token during tokenization.

Annotate receives the tokens to inspect and returns a token slice. The
implementations in this file mutate the tokens in place and return the same
slice they were given.
*/
type AnnotateTokens interface {
	Annotate([]*Token) []*Token
}
/*
TypeBasedAnnotation performs the first pass of annotation, which makes decisions
based purely on the word type of each word:
  - sentence-ending tokens ('?', '!', and '.' in the algorithm description; decided
    here by HasSentEndChars) are marked as sentence breaks.
  - tokens ending in two or more periods (ellipsis) are left unmarked by this pass.
  - any other word ending in '.' that's a known abbreviation is marked as an abbreviation.
  - any other word ending in '.' is marked as a sentence break.

The results are recorded directly on each Token through its SentBreak and Abbr
fields; nothing else is returned.
*/
type TypeBasedAnnotation struct {
	// The embedded values supply the promoted methods used by typeAnnotation
	// (HasSentEndChars, HasPeriodFinal and IsAbbr). Their order matters:
	// NewAnnotations builds this struct with a positional composite literal.
	*Storage
	PunctStrings
	TokenExistential
}
// NewTypeBasedAnnotation builds a TypeBasedAnnotation from the given storage,
// punctuation strings and token predicates, and returns a pointer to it.
func NewTypeBasedAnnotation(s *Storage, p PunctStrings, e TokenExistential) *TypeBasedAnnotation {
	annotation := new(TypeBasedAnnotation)
	annotation.Storage = s
	annotation.PunctStrings = p
	annotation.TokenExistential = e
	return annotation
}
// NewAnnotations returns the default annotators used by the tokenizer, in the
// order they are meant to run: a TypeBasedAnnotation first, followed by a
// TokenBasedAnnotation. Both share the same storage, punctuation strings and
// word tokenizer.
func NewAnnotations(s *Storage, p PunctStrings, word WordTokenizer) []AnnotateTokens {
	// The word tokenizer fills both trailing slots of the orthographic context.
	ortho := &OrthoContext{s, p, word, word}

	typeBased := &TypeBasedAnnotation{
		Storage:          s,
		PunctStrings:     p,
		TokenExistential: word,
	}
	tokenBased := &TokenBasedAnnotation{
		Storage:      s,
		PunctStrings: p,
		TokenParser:  word,
		TokenGrouper: &DefaultTokenGrouper{},
		Ortho:        ortho,
	}

	return []AnnotateTokens{typeBased, tokenBased}
}
// Annotate runs the type-based annotation on every token, in order. The tokens
// are modified in place and the same slice is returned.
func (a *TypeBasedAnnotation) Annotate(tokens []*Token) []*Token {
	for i := range tokens {
		a.typeAnnotation(tokens[i])
	}
	return tokens
}
// typeAnnotation applies the first-pass, type-based classification to a single
// token, mutating it in place:
//   - a token for which HasSentEndChars reports true is marked as a sentence break;
//   - a token that is not period-final, or that ends in "..", is left untouched;
//   - any other period-final token is marked as an abbreviation when IsAbbr
//     accepts the pair (lower-cased token without its final character, the
//     part of that string after its last hyphen), and as a sentence break
//     otherwise.
func (a *TypeBasedAnnotation) typeAnnotation(token *Token) {
	if a.HasSentEndChars(token) {
		token.SentBreak = true
		return
	}
	if !a.HasPeriodFinal(token) || strings.HasSuffix(token.Tok, "..") {
		return
	}

	// Drop the final character by rune, not by byte: slicing token.Tok with a
	// rune count cuts tokens containing multi-byte characters at the wrong
	// offset and produces a mangled abbreviation candidate.
	chars := []rune(token.Tok)
	if len(chars) == 0 {
		// Nothing to strip; avoids slicing with a negative bound.
		return
	}
	tokNoPeriod := strings.ToLower(string(chars[:len(chars)-1]))

	// Element after the last hyphen. LastIndex yields -1 when there is no
	// hyphen, so the whole string is used in that case.
	tokLastHyphEl := tokNoPeriod[strings.LastIndex(tokNoPeriod, "-")+1:]

	if a.IsAbbr(tokNoPeriod, tokLastHyphEl) {
		token.Abbr = true
	} else {
		token.SentBreak = true
	}
}
/*
TokenBasedAnnotation performs a token-based classification (section 4) over the given
tokens, making use of the orthographic heuristic (4.1.1), collocation
heuristic (4.1.2) and frequent sentence starter heuristic (4.1.3).

NOTE(review): the section numbers presumably refer to the paper describing the
Punkt algorithm (Kiss & Strunk) — confirm.
*/
type TokenBasedAnnotation struct {
	// Field order matters: NewAnnotations builds this struct with a positional
	// composite literal.

	// tokenAnnotation reads Collocations, SentStarters and OrthoContext through
	// the receiver; presumably these are promoted from Storage — confirm.
	*Storage
	PunctStrings
	// TokenParser answers the per-token questions used by tokenAnnotation
	// (HasPeriodFinal, TypeNoPeriod, TypeNoSentPeriod, IsInitial, IsEllipsis,
	// FirstUpper).
	TokenParser
	// TokenGrouper.Group turns the token slice into the pairs that Annotate walks.
	TokenGrouper
	// Ortho.Heuristic is compared against 1, 0 and -1 in tokenAnnotation.
	Ortho
}
// Annotate groups the tokens into pairs using the configured TokenGrouper and
// applies the token-based annotation to each pair in turn. The tokens are
// modified in place and the same slice is returned.
func (a *TokenBasedAnnotation) Annotate(tokens []*Token) []*Token {
	for _, pair := range a.TokenGrouper.Group(tokens) {
		current, next := pair[0], pair[1]
		a.tokenAnnotation(current, next)
	}
	return tokens
}
// tokenAnnotation re-examines tokOne using the token that follows it (tokTwo)
// and may update tokOne's SentBreak and Abbr flags. In the spaced-ellipsis case
// it also clears tokTwo's SentBreak flag. It does nothing when tokTwo is nil or
// when tokOne is not period-final. The checks run in a fixed order and the
// first one that applies decides the outcome.
func (a *TokenBasedAnnotation) tokenAnnotation(tokOne, tokTwo *Token) {
	// Without a following token, or without a final period, there is nothing
	// to decide.
	if tokTwo == nil || !a.TokenParser.HasPeriodFinal(tokOne) {
		return
	}

	var (
		curType   = a.TokenParser.TypeNoPeriod(tokOne)
		nextType  = a.TokenParser.TypeNoSentPeriod(tokTwo)
		isInitial = a.TokenParser.IsInitial(tokOne)
	)

	// [4.1.2. Collocation Heuristic] When the words on both sides of the
	// period form a known collocation, the token is an abbreviation and NOT a
	// sentence break. Collocations whose second word is a frequent sentence
	// starter are excluded in training.
	if a.Collocations[curType+","+nextType] != 0 {
		tokOne.SentBreak = false
		tokOne.Abbr = true
		return
	}

	// [4.2. Token-Based Reclassification of Abbreviations] An abbreviation or
	// ellipsis (that is not an initial) may *also* be a sentence break.
	if (tokOne.Abbr || a.TokenParser.IsEllipsis(tokOne)) && !isInitial {
		// [4.1.1. Orthographic Heuristic] Orthographic evidence says the next
		// word starts a sentence, or
		// [4.1.3. Frequent Sentence Starter Heuristic] the next word is
		// capitalized and is a member of the frequent-sentence-starters list.
		if a.Ortho.Heuristic(tokTwo) == 1 ||
			(a.TokenParser.FirstUpper(tokTwo) && a.SentStarters[nextType] != 0) {
			tokOne.SentBreak = true
			return
		}
	}

	// Two consecutive lone "." tokens are most likely part of a spaced
	// ellipsis (". . ."), so neither of them is a sentence break.
	if tokOne.Tok == "." && tokTwo.Tok == "." {
		tokOne.SentBreak, tokTwo.SentBreak = false, false
		return
	}

	// [4.3. Token-Based Detection of Initials and Ordinals] Only initials and
	// numbers are candidates for being reclassified as abbreviations.
	if !isInitial && curType != "##number##" {
		return
	}

	switch a.Ortho.Heuristic(tokTwo) {
	case 0:
		// The next word does not start a sentence.
		tokOne.SentBreak = false
		tokOne.Abbr = true
	case -1:
		// Special heuristic for initials: the orthographic heuristic is
		// unknown and the next word is always capitalized, so mark the
		// token as an abbreviation (eg: J. Bach).
		if isInitial &&
			a.TokenParser.FirstUpper(tokTwo) &&
			a.OrthoContext[nextType]&orthoLc == 0 {
			tokOne.SentBreak = false
			tokOne.Abbr = true
		}
	}
}
|