/usr/share/namazu/pl/conf.pl is in namazu2-index-tools 2.0.21-21.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
#
# Namazu configuration file read by mknmz.
#
package conf;    # Required: do not remove this line!

#===================================================================
#
# Email address of the administrator.
#
# NOTE(review): the host part looks like an auto-generated machine name;
# confirm it and replace it with a real contact address if necessary.
#
$ADDRESS = q{webmaster@lgw01-47.openstacklocal};
#===================================================================
#
# Regular Expression Patterns
#

#
# Suffixes that identify HTML files.
#
$HTML_SUFFIX = join '|', 'html?', '[ps]html', 'html\.[a-z]{2}';

#
# File names that WILL be targeted.
# NOTE: It can be specified by the --allow=regex option.
#       Do NOT use `$' or `^' anchors.
#       Case-insensitive.
#
# Every entry of the list below is a file suffix; each one is expanded
# to the alternative `.*\.SUFFIX'.
#
my @allowed_suffixes = (
    "(?:$HTML_SUFFIX)", 'txt',       # HTML, plain text
    'gz', 'Z', 'bz2',                # Compressed files
    'pdf', 'ps',                     # PDF, PostScript
    'tex', 'dvi',                    # TeX, DVI
    'rpm', 'deb',                    # RPM, DEB
    'doc', 'xls', 'pp[st]',          # Word, Excel, PowerPoint
    'docx', 'xlsx', 'pp[st]x',       # MS-OfficeOpenXML Word, Excel, PowerPoint
    'vs[dst]', 'v[dst]x',            # Visio
    'j[sabf]w', 'jtd',               # Ichitaro 4, 5, 6, 7, 8
    'sx[widc]',                      # OpenOffice Writer, Calc, Impress, Draw
    'od[tspg]',                      # OpenOffice 2.0
    'rtf',                           # Rich Text Format
    'hdml', 'mht',                   # HDML, MHTML
    'mp3',                           # MP3
    'gnumeric',                      # Gnumeric
    'kwd', 'ksp',                    # KWord, KSpread
    'kpr', 'flw',                    # KPresenter, Kivio
    'eml',                           # Mail/News
);

$ALLOW_FILE = join '|',
    ( map { '.*\.' . $_ } @allowed_suffixes ),
    '\d+',                           # Mail/News (all-digit file names)
    '[-\w]+\.[1-9n]';                # man
#
# File names that will NOT be targeted.
# NOTE: It can be specified by the --deny=regex option.
#       Do NOT use `$' or `^' anchors.
#       Case-insensitive.
#
$DENY_FILE = join '|',
    '.*\.(gif|png|jpg|jpeg)',    # names ending in .gif, .png, .jpg or .jpeg
    '.*\.tar\.gz',               # names ending in .tar.gz
    'core',                      # the name "core"
    '.*\.bak',                   # names ending in .bak
    '.*~',                       # names ending in a tilde
    '\..*',                      # names beginning with a dot
    "\x23.*";                    # names beginning with a hash sign (\x23)
#
# DOS device names (DDN) that will NOT be targeted.
# NOTE: Only for Windows.
#       Do NOT use `$' or `^' anchors.
#       Case-insensitive.
#
# The string is single-quoted on purpose: `\$' then reaches the regex
# engine as an escaped, literal dollar sign, so the alternative matches
# the device name "clock$". Inside a double-quoted string the backslash
# would be consumed by Perl and the remaining bare `$' would act as an
# end-of-string anchor, matching the plain name "clock" instead.
#
$DENY_DDN = 'con|aux|nul|prn|lpt[1-9]|com[1-9][0-9]?|clock\$|xmsxxxx0';
#
# PATHNAMEs that will NOT be targeted.
# NOTE: Usually specified by the --exclude=regex option.
#
$EXCLUDE_PATH = undef;

#
# File names that can be omitted in a URI,
# e.g. 'index.html|index.htm|Default.html'.
#
# NOTE: This is similar to Apache's "DirectoryIndex" directive.
#
$DIRECTORY_INDEX = '';

#
# Mail/News header fields that should be searchable.
# NOTE: case-insensitive
#
$REMAIN_HEADER = join '|', qw(From Date Message-ID);

#
# Fields that are used for field-specified searching.
# NOTE: case-insensitive
#
$SEARCH_FIELD = join '|', qw(
    message-id subject from date uri newsgroups to summary size
);

#
# Meta tags that are used for field-specified searching.
# NOTE: case-insensitive
#
$META_TAGS = join '|', qw(keywords description);

#
# Aliases for NMZ.field.* files.
# NOTE: Editing is NOT recommended.
#
%FIELD_ALIASES = (
    title  => 'subject',
    author => 'from',
);
#
# HTML elements that are replaced with the null string when they are
# removed. Normally, elements are replaced with a single space character.
#
$NON_SEPARATION_ELEMENTS = join '|', qw(
    A TT CODE SAMP KBD VAR B STRONG I EM CITE FONT U
    STRIKE BIG SMALL DFN ABBR ACRONYM Q SUB SUP SPAN BDO
);

#
# Attributes of an HTML tag that should be searchable.
#
$HTML_ATTRIBUTES = join '|', qw(ALT SUMMARY TITLE);
#===================================================================
#
# Critical Numbers
#

#
# The maximum size of files which can be loaded in memory at once.
# If you have plenty of memory, you can increase the value.
# If you have little memory, you can decrease the value.
#
$ON_MEMORY_MAX = 5_000_000;

#
# The maximum file size for indexing. Larger files are ignored.
# NOTE: This value is usually larger than TEXT_SIZE_MAX because
#       binary-formatted files such as PDF or Word are larger.
#
$FILE_SIZE_MAX = 2_000_000;

#
# The maximum text size for indexing. Larger files are ignored.
#
$TEXT_SIZE_MAX = 600_000;

#
# The maximum length of a word. Longer words are ignored.
#
$WORD_LENG_MAX = 128;

#
# Weights for HTML elements, used for term weighting.
#
%Weight = (
    metakey => 32,    # for <meta name="keywords" content="foo bar">
    headers => 8,     # for Mail/News headers
    html    => {
        title => 16,
        h1    => 8,
        h2    => 7,
        h3    => 6,
        h4    => 5,
        h5    => 4,
        h6    => 3,
        a     => 4,

        # All of the following elements share the weight 2.
        ( map { ( $_ => 2 ) }
              qw(strong em kbd samp var code cite abbr acronym dfn) ),
    },
);

#
# The maximum length of an HTML-tagged string which can be processed
# for term weighting.
# NOTE: Quite a few people misuse <h[1-6]> just to change the font size.
#
$INVALID_LENG = 128;

#
# The maximum length of a field.
# This MUST be smaller than libnamazu.h's BUFSIZE (usually 1024).
#
$MAX_FIELD_LENGTH = 200;
#===================================================================
#
# Software for handling Japanese text
#

#
# Network Kanji Filter nkf v1.71 or later
#
$NKF = 'module_nkf';

#
# KAKASI 2.x or later
# Text::Kakasi 1.05 or later
#
$KAKASI = 'module_kakasi -ieuc -oeuc -w';

#
# ChaSen 2.02 or later (simple wakatigaki)
# Text::ChaSen 1.03
#
$CHASEN = '/usr/bin/chasen -i e -j -F "%m "';

#
# ChaSen 2.02 or later (with noun word extraction)
#
# The string is single-quoted, so the trailing \n is a literal backslash
# followed by "n", not a newline character.
#
$CHASEN_NOUN = '/usr/bin/chasen -i e -j -F "%m %H\n"';

#
# MeCab
#
$MECAB = 'no';

#
# Default Japanese processor: KAKASI, ChaSen or MeCab.
#
$WAKATI = $KAKASI;
#===================================================================
#
# Directories
#
# Every assignment in this section is commented out, so this file does
# not set $LIBDIR, $FILTERDIR or $TEMPLATEDIR itself.
# NOTE(review): the @...@ values look like build-time substitution
# placeholders, and the #[WIN] lines look like Windows-only defaults;
# confirm how (or whether) they are meant to be enabled.
#
# $LIBDIR = "@PERLLIBDIR@";
# $FILTERDIR = "@FILTERDIR@";
# $TEMPLATEDIR = "@TEMPLATEDIR@";
#
#[WIN] $LIBDIR = 'C:/namazu/share/namazu/pl';
#[WIN] $FILTERDIR = 'C:/namazu/share/namazu/filter';
#[WIN] $TEMPLATEDIR = 'C:/namazu/share/namazu/template';
#
# The file must end with a true value so that loading it with `require'
# succeeds.
1;
|