This file is indexed.

/usr/share/doc/sunpinyin/SLM-train.mk is in sunpinyin-utils 2.0.3+git20140127-4.

This file is owned by root:root, with mode 0o644.

The actual contents of the file can be viewed below.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
#!/usr/bin/make -f
# -*- mode: makefile; indent-tabs-mode: t -*- vim:noet:ts=4
# Sample Makefile for SLM training.

# Inputs: the lexicon (handed to the tools below with -d / -i / -l) and the
# training corpus.
DICT_FILE = dict.utf8
CORPUS_FILE = corpus.utf8

# Common base name of every generated language-model file.
SLM_TARGET = lm_sc
# Id stream written by the *_ids targets and read by ids2ngram.
IDS_FILE = ${SLM_TARGET}.ids

# Bigram chain, in build order: ids2ngram -> slmbuild -> slmprune -> slmthread,
# plus the text files written by slminfo / tslminfo.
BIGRAM_STAT_FILE = ${SLM_TARGET}.id.2gm
SLM2_RAW_FILE = ${SLM_TARGET}.2gm.raw
SLM2_FILE = ${SLM_TARGET}.2gm
SLM2_TEXT_FILE = ${SLM_TARGET}.2gm.arpa
TSLM2_FILE = ${SLM_TARGET}.t2g
TSLM2_TEXT_FILE = ${SLM_TARGET}.t2g.arpa

# Trigram chain, same stages as the bigram chain.
# Keep comments on their own lines and leave no blanks after a value: make
# keeps trailing whitespace as part of the variable.
TRIGRAM_STAT_FILE = ${SLM_TARGET}.id.3gm
SLM3_RAW_FILE = ${SLM_TARGET}.3gm.raw
SLM3_FILE = ${SLM_TARGET}.3gm
SLM3_TEXT_FILE = ${SLM_TARGET}.3gm.arpa
TSLM3_FILE = ${SLM_TARGET}.t3g
TSLM3_TEXT_FILE = ${SLM_TARGET}.t3g.arpa

# Output and log file of genpyt (targets lexicon2 / lexicon3).
PYTRIE_FILE = pydict_sc.bin
PYTRIE_LOG_FILE = pydict_sc.log

# Default goal: run the complete trigram bootstrap.
.PHONY: all
all: bootstrap3

# Several recipes below write their target through `> $@`, which creates the
# file before the tool has succeeded.  Have make delete a target whose recipe
# failed so a truncated file is never mistaken for an up-to-date one.
.DELETE_ON_ERROR:

# Segment ${CORPUS_FILE} with mmseg and the lexicon, producing ${IDS_FILE}.
# The target name is only a label (the recipe writes ${IDS_FILE}, not $@), so
# it is declared phony and runs every time it is requested.
# Output goes to a temporary file first: a failed run must not leave a
# truncated, newer-looking ${IDS_FILE} behind for the n-gram rules to consume.
.PHONY: mmseg_ids
mmseg_ids: ${DICT_FILE} ${CORPUS_FILE}
	mmseg -f bin -s 10 -a 9 -d ${DICT_FILE} ${CORPUS_FILE} > ${IDS_FILE}.tmp
	mv -f ${IDS_FILE}.tmp ${IDS_FILE}

# Re-segment ${CORPUS_FILE} with slmseg, using the threaded bigram model
# ${TSLM2_FILE} (-m), and replace ${IDS_FILE} with the result.
# Phony because the recipe writes ${IDS_FILE} rather than $@.  The output is
# staged in a temporary file so a failed run keeps the previous id stream.
.PHONY: slm2_ids
slm2_ids: ${DICT_FILE} ${CORPUS_FILE} ${TSLM2_FILE}
	slmseg -f bin -s 10 \
		-d ${DICT_FILE} -m ${TSLM2_FILE} ${CORPUS_FILE} > ${IDS_FILE}.tmp
	mv -f ${IDS_FILE}.tmp ${IDS_FILE}

# Re-segment ${CORPUS_FILE} with slmseg, using the threaded trigram model
# ${TSLM3_FILE} (-m), and replace ${IDS_FILE} with the result.
# Phony because the recipe writes ${IDS_FILE} rather than $@.  The output is
# staged in a temporary file so a failed run keeps the previous id stream.
.PHONY: slm3_ids
slm3_ids: ${DICT_FILE} ${CORPUS_FILE} ${TSLM3_FILE}
	slmseg -f bin -s 10 \
		-d ${DICT_FILE} -m ${TSLM3_FILE} ${CORPUS_FILE} > ${IDS_FILE}.tmp
	mv -f ${IDS_FILE}.tmp ${IDS_FILE}

# `bigram_stat` is a convenience alias for the bigram statistics file.
.PHONY: bigram_stat
bigram_stat: ${BIGRAM_STAT_FILE}

# Build the bigram statistics from the id stream with ids2ngram (-n 2).
# $@.tmp is the file given to ids2ngram via -s; it is removed once the tool
# has finished successfully.
${BIGRAM_STAT_FILE}: ${IDS_FILE}
	ids2ngram -n 2 -p 20000000 -s $@.tmp -o $@ $<
	rm -f $@.tmp

# `slm2_raw` is a convenience alias for the unpruned bigram model file.
.PHONY: slm2_raw
slm2_raw: ${SLM2_RAW_FILE}

# Run slmbuild (-n 2) on the bigram statistics to produce the raw model.
# NOTE(review): the -w/-c/-d/-b/-e values are tuning parameters of slmbuild;
# check its usage text before changing them.
${SLM2_RAW_FILE}: ${BIGRAM_STAT_FILE}
	slmbuild -n 2 -w 200000 -c 0,2 -d ABS,0.005 -d ABS -b 10 -e 9 -o $@ $<

# `slm2` is a convenience alias for the pruned bigram model file.
.PHONY: slm2
slm2: ${SLM2_FILE}

# Prune the raw bigram model with slmprune (input $<, output $@).
# NOTE(review): "R 100000 200000" are slmprune arguments whose meaning is not
# visible in this file; consult slmprune's usage before tuning.
${SLM2_FILE}: ${SLM2_RAW_FILE}
	slmprune $< $@ R 100000 200000

# `tslm2` is a convenience alias for the threaded bigram model file.
.PHONY: tslm2
tslm2: ${TSLM2_FILE}

# Convert the pruned bigram model into its threaded form with slmthread.
${TSLM2_FILE}: ${SLM2_FILE}
	slmthread $< $@

# `slm2_info` is a convenience alias for the text rendering of the bigram model.
.PHONY: slm2_info
slm2_info: ${SLM2_TEXT_FILE}

# Write slminfo's output for the pruned bigram model to $@; the lexicon is
# supplied with -l.
${SLM2_TEXT_FILE}: ${DICT_FILE} ${SLM2_FILE}
	slminfo -p -v -l ${DICT_FILE} ${SLM2_FILE} > $@

# `tslm2_info` is a convenience alias for the text rendering of the threaded
# bigram model.
.PHONY: tslm2_info
tslm2_info: ${TSLM2_TEXT_FILE}

# Write tslminfo's output for the threaded bigram model to $@; the lexicon is
# supplied with -l.
# NOTE(review): the tslm3_info rule passes -p to tslminfo and this one does
# not -- confirm whether the difference is intended.
${TSLM2_TEXT_FILE}: ${DICT_FILE} ${TSLM2_FILE}
	tslminfo -v -l ${DICT_FILE} ${TSLM2_FILE} > $@

# Run genpyt on the lexicon and the threaded bigram model, writing
# ${PYTRIE_FILE} and the log ${PYTRIE_LOG_FILE}.
# Phony: the recipe never creates a file called `lexicon2`.
.PHONY: lexicon2
lexicon2: ${DICT_FILE} ${TSLM2_FILE}
	genpyt -i ${DICT_FILE} -s ${TSLM2_FILE} \
		-l ${PYTRIE_LOG_FILE} -o ${PYTRIE_FILE}

# `trigram_stat` is a convenience alias for the trigram statistics file.
.PHONY: trigram_stat
trigram_stat: ${TRIGRAM_STAT_FILE}

# Build the trigram statistics from the id stream with ids2ngram (-n 3).
# $@.tmp is the file given to ids2ngram via -s; it is removed once the tool
# has finished successfully.
${TRIGRAM_STAT_FILE}: ${IDS_FILE}
	ids2ngram -n 3 -p 20000000 -s $@.tmp -o $@ $<
	rm -f $@.tmp

# `slm3_raw` is a convenience alias for the unpruned trigram model file.
.PHONY: slm3_raw
slm3_raw: ${SLM3_RAW_FILE}

# Run slmbuild (-n 3) on the trigram statistics to produce the raw model.
# NOTE(review): the -w/-c/-d/-b/-e values are tuning parameters of slmbuild;
# check its usage text before changing them.
${SLM3_RAW_FILE}: ${TRIGRAM_STAT_FILE}
	slmbuild -n 3 -w 200000 -c 0,2,2 -d ABS,0.0005 -d ABS -d ABS -b 10 -e 9 \
		-o $@ $<
# `slm3` is a convenience alias for the pruned trigram model file.
.PHONY: slm3
slm3: ${SLM3_FILE}

# Prune the raw trigram model with slmprune (input $<, output $@).
# NOTE(review): "R 100000 2500000 1000000" are slmprune arguments whose meaning
# is not visible in this file; consult slmprune's usage before tuning.
${SLM3_FILE}: ${SLM3_RAW_FILE}
	slmprune $< $@ R 100000 2500000 1000000

# `tslm3` is a convenience alias for the threaded trigram model file.
.PHONY: tslm3
tslm3: ${TSLM3_FILE}

# Convert the pruned trigram model into its threaded form with slmthread.
${TSLM3_FILE}: ${SLM3_FILE}
	slmthread $< $@

# `slm3_info` is a convenience alias for the text rendering of the trigram model.
.PHONY: slm3_info
slm3_info: ${SLM3_TEXT_FILE}

# Write slminfo's output for the pruned trigram model to $@; the lexicon is
# supplied with -l.
${SLM3_TEXT_FILE}: ${DICT_FILE} ${SLM3_FILE}
	slminfo -p -v -l ${DICT_FILE} ${SLM3_FILE} > $@

# `tslm3_info` is a convenience alias for the text rendering of the threaded
# trigram model.
.PHONY: tslm3_info
tslm3_info: ${TSLM3_TEXT_FILE}

# Write tslminfo's output for the threaded trigram model to $@; the lexicon is
# supplied with -l.
# NOTE(review): this rule passes -p to tslminfo while the tslm2_info rule does
# not -- confirm whether the difference is intended.
${TSLM3_TEXT_FILE}: ${DICT_FILE} ${TSLM3_FILE}
	tslminfo -p -v -l ${DICT_FILE} ${TSLM3_FILE} > $@

# Run genpyt on the lexicon and the threaded trigram model, writing
# ${PYTRIE_FILE} and the log ${PYTRIE_LOG_FILE}.
# Phony: the recipe never creates a file called `lexicon3`.
.PHONY: lexicon3
lexicon3: ${DICT_FILE} ${TSLM3_FILE}
	genpyt -i ${DICT_FILE} -s ${TSLM3_FILE} \
		-l ${PYTRIE_LOG_FILE} -o ${PYTRIE_FILE}

# Remove intermediate files: leftover *.tmp scratch files, the id stream, the
# n-gram statistics and the unpruned models.  Pruned/threaded models, their
# text renderings and the genpyt outputs are kept.
# The statistics files are named by *_STAT_FILE; the shorter *_STAT names are
# not defined anywhere and would expand to nothing.
.PHONY: tmp_clean
tmp_clean:
	rm -f *.tmp ${IDS_FILE}
	rm -f ${BIGRAM_STAT_FILE} ${SLM2_RAW_FILE}
	rm -f ${TRIGRAM_STAT_FILE} ${SLM3_RAW_FILE}

# Aggregate steps: (re)write the id stream with one of the *_ids targets, then
# bring the threaded model's text file up to date.  They depend on make
# handling prerequisites left to right (the *_ids step must finish before the
# *_info step starts), which only holds for serial execution; .NOTPARALLEL
# keeps that true even when -j is given.
.NOTPARALLEL:
.PHONY: mmseg_bigram mmseg_trigram slm_bigram slm_trigram bootstrap2 bootstrap3

mmseg_bigram: mmseg_ids tslm2_info
mmseg_trigram: mmseg_ids tslm3_info
slm_bigram: slm2_ids tslm2_info
slm_trigram: slm3_ids tslm3_info

# Makefile to hand to the sub-makes below.  -f is not propagated through
# MAKEFLAGS, so without it a run started as `make -f SLM-train.mk` (or via the
# #! line) would have its sub-makes look for a default makefile instead.
this_makefile := $(firstword $(MAKEFILE_LIST))

# Full pipelines.  Every step is a separate make process, so file timestamps
# are examined afresh after the preceding step rewrote ${IDS_FILE}.
# $(MAKE) (rather than a literal `make`) re-runs the same make program and
# lets -n and command-line variable overrides reach the sub-makes.
# NOTE(review): slm_* is run twice on purpose -- presumably the first pass
# re-segments the corpus with the mmseg-trained model and the second retrains
# on that segmentation before segmenting once more; confirm against the
# sunpinyin training documentation.
bootstrap2:
	$(MAKE) -f $(this_makefile) mmseg_bigram
	$(MAKE) -f $(this_makefile) slm_bigram
	$(MAKE) -f $(this_makefile) slm_bigram
	$(MAKE) -f $(this_makefile) lexicon2

bootstrap3:
	$(MAKE) -f $(this_makefile) mmseg_trigram
	$(MAKE) -f $(this_makefile) slm_trigram
	$(MAKE) -f $(this_makefile) slm_trigram
	$(MAKE) -f $(this_makefile) lexicon3