// Package sentences (from gopkg.in/neurosnap/sentences.v1) implements an
// unsupervised (Punkt-style) sentence boundary detector.
package sentences
import "fmt"
// SentenceTokenizer interface is used by the Tokenize function, can be extended to correct sentence
// boundaries that punkt misses.
//
// AnnotateTokens runs the supplied annotators over word tokens;
// Tokenize splits raw text into sentences.
type SentenceTokenizer interface {
AnnotateTokens([]*Token, ...AnnotateTokens) []*Token
Tokenize(string) []*Sentence
}
// DefaultSentenceTokenizer is a sentence tokenizer which uses an unsupervised algorithm to build a model
// for abbreviation words, collocations, and words that start sentences
// and then uses that model to find sentence boundaries.
//
// It embeds the trained Storage model, a WordTokenizer for splitting text
// into word tokens, and PunctStrings describing the language's punctuation.
type DefaultSentenceTokenizer struct {
*Storage
WordTokenizer
PunctStrings
// Annotations is the ordered pipeline applied by AnnotateTokens.
Annotations []AnnotateTokens
}
// NewSentenceTokenizer are the sane defaults for the sentence tokenizer:
// the default punctuation set, the default word tokenizer, and the default
// annotation pipeline, all wired to the supplied storage model.
func NewSentenceTokenizer(s *Storage) *DefaultSentenceTokenizer {
	punct := NewPunctStrings()
	words := NewWordTokenizer(punct)
	return &DefaultSentenceTokenizer{
		Storage:       s,
		PunctStrings:  punct,
		WordTokenizer: words,
		Annotations:   NewAnnotations(s, punct, words),
	}
}
// NewTokenizer wraps around DST doing the work for customizing the tokenizer:
// the caller supplies the word tokenizer and punctuation language, and the
// default annotation pipeline is built from them.
func NewTokenizer(s *Storage, word WordTokenizer, lang PunctStrings) *DefaultSentenceTokenizer {
	return &DefaultSentenceTokenizer{
		Storage:       s,
		PunctStrings:  lang,
		WordTokenizer: word,
		Annotations:   NewAnnotations(s, lang, word),
	}
}
/*
AnnotateTokens given a set of tokens augmented with markers for line-start and
paragraph-start, returns an iterator through those tokens with full
annotation including predicted sentence breaks.
*/
func (s *DefaultSentenceTokenizer) AnnotateTokens(tokens []*Token, annotate ...AnnotateTokens) []*Token {
	// Feed the tokens through each annotator in order; every stage
	// consumes the previous stage's output.
	annotated := tokens
	for _, stage := range annotate {
		annotated = stage.Annotate(annotated)
	}
	return annotated
}
/*
AnnotatedTokens are the fully annotated word tokens. This allows for adhoc adjustments to the tokens
*/
func (s *DefaultSentenceTokenizer) AnnotatedTokens(text string) []*Token {
	// Use the default word tokenizer but only grab the tokens that
	// relate to a sentence ending punctuation. This means grab the word
	// before and after the punctuation.
	words := s.WordTokenizer.Tokenize(text, true)
	if len(words) == 0 {
		return nil
	}
	// Run the tokenizer's configured annotation pipeline over the words.
	return s.AnnotateTokens(words, s.Annotations...)
}
/*
SentencePositions returns an array of positions instead of returning an array
of sentences.

Each position is the character offset at which a sentence ends; the final
offset (len(text)) is always included exactly once.
*/
func (s *DefaultSentenceTokenizer) SentencePositions(text string) []int {
	annotatedTokens := s.AnnotatedTokens(text)
	// +1 leaves room for the end-of-text position appended below.
	positions := make([]int, 0, len(annotatedTokens)+1)
	for _, token := range annotatedTokens {
		if !token.SentBreak {
			continue
		}
		positions = append(positions, token.Position)
	}
	// Mirror Tokenize's `lastBreak != len(text)` guard: only record the
	// end of the text when it is not already the last sentence break,
	// which previously produced a duplicated final position.
	if end := len(text); len(positions) == 0 || positions[len(positions)-1] != end {
		positions = append(positions, end)
	}
	return positions
}
/*
Sentence container to hold sentences, provides the character positions
as well as the text for that sentence.
*/
type Sentence struct {
// Start is the character offset of the sentence's first byte in the input text.
Start int `json:"start"`
// End is the character offset just past the sentence's last byte.
End int `json:"end"`
// Text is the input substring text[Start:End].
Text string `json:"text"`
}
// String implements fmt.Stringer, rendering the span and its text for debugging.
func (s Sentence) String() string {
return fmt.Sprintf("<Sentence [%d:%d] '%s'>", s.Start, s.End, s.Text)
}
// Tokenize splits text input into sentence tokens. Each annotated token
// flagged as a sentence break closes the current sentence; any trailing
// text after the final break becomes one last sentence.
func (s *DefaultSentenceTokenizer) Tokenize(text string) []*Sentence {
	tokens := s.AnnotatedTokens(text)
	sentences := make([]*Sentence, 0, len(tokens))
	start := 0
	for _, tok := range tokens {
		if !tok.SentBreak {
			continue
		}
		// Close the sentence running from the previous break to this one.
		sentences = append(sentences, &Sentence{start, tok.Position, text[start:tok.Position]})
		start = tok.Position
	}
	// Capture any remainder that was not terminated by a sentence break.
	if start != len(text) {
		sentences = append(sentences, &Sentence{start, len(text), text[start:]})
	}
	return sentences
}
|