This file is indexed.

/usr/share/doc/ocrmypdf/html/batch.html is in ocrmypdf-doc 6.1.2-1ubuntu1.

This file is owned by root:root, with mode 0o644.

The actual contents of the file can be viewed below.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
<!DOCTYPE html>
<!--[if IE 8]><html class="no-js lt-ie9" lang="en" > <![endif]-->
<!--[if gt IE 8]><!--> <html class="no-js" lang="en" > <!--<![endif]-->
<head>
  <meta charset="utf-8">
  
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  
  <title>Batch processing &mdash; ocrmypdf 6.1.2 documentation</title>
  

  
  
  
  

  

  
  
    

  

  
  
    <link rel="stylesheet" href="_static/css/theme.css" type="text/css" />
  

  

  
        <link rel="index" title="Index"
              href="genindex.html"/>
        <link rel="search" title="Search" href="search.html"/>
    <link rel="top" title="ocrmypdf 6.1.2 documentation" href="index.html"/>
        <link rel="next" title="PDF security issues" href="security.html"/>
        <link rel="prev" title="Advanced features" href="advanced.html"/> 

  
  <script src="_static/js/modernizr.min.js"></script>

</head>

<body class="wy-body-for-nav" role="document">

   
  <div class="wy-grid-for-nav">

    
    <nav data-toggle="wy-nav-shift" class="wy-nav-side">
      <div class="wy-side-scroll">
        <div class="wy-side-nav-search">
          

          
            <a href="index.html" class="icon icon-home"> ocrmypdf
          

          
          </a>

          

          
<div role="search">
  <form id="rtd-search-form" class="wy-form" action="search.html" method="get">
    <input type="text" name="q" placeholder="Search docs" />
    <input type="hidden" name="check_keywords" value="yes" />
    <input type="hidden" name="area" value="default" />
  </form>
</div>

          
        </div>

        <div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="main navigation">
          
            
            
              
            
            
              <ul>
<li class="toctree-l1"><a class="reference internal" href="introduction.html">Introduction</a></li>
<li class="toctree-l1"><a class="reference internal" href="release_notes.html">Release notes</a></li>
<li class="toctree-l1"><a class="reference internal" href="languages.html">Installing additional language packs</a></li>
</ul>
<p class="caption"><span class="caption-text">Usage</span></p>
<ul class="current">
<li class="toctree-l1"><a class="reference internal" href="cookbook.html">Cookbook</a></li>
<li class="toctree-l1"><a class="reference internal" href="advanced.html">Advanced features</a></li>
<li class="toctree-l1 current"><a class="current reference internal" href="#">Batch processing</a><ul>
<li class="toctree-l2"><a class="reference internal" href="#batch-jobs">Batch jobs</a></li>
<li class="toctree-l2"><a class="reference internal" href="#directory-trees">Directory trees</a><ul>
<li class="toctree-l3"><a class="reference internal" href="#sample-script">Sample script</a></li>
<li class="toctree-l3"><a class="reference internal" href="#api">API</a></li>
<li class="toctree-l3"><a class="reference internal" href="#synology-diskstations">Synology DiskStations</a></li>
<li class="toctree-l3"><a class="reference internal" href="#huge-batch-jobs">Huge batch jobs</a></li>
</ul>
</li>
<li class="toctree-l2"><a class="reference internal" href="#hot-watched-folders">Hot (watched) folders</a><ul>
<li class="toctree-l3"><a class="reference internal" href="#caveats">Caveats</a></li>
<li class="toctree-l3"><a class="reference internal" href="#alternatives">Alternatives</a></li>
</ul>
</li>
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="security.html">PDF security issues</a></li>
<li class="toctree-l1"><a class="reference internal" href="errors.html">Common error messages</a></li>
</ul>

            
          
        </div>
      </div>
    </nav>

    <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">

      
      <nav class="wy-nav-top" role="navigation" aria-label="top navigation">
        
          <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
          <a href="index.html">ocrmypdf</a>
        
      </nav>


      
      <div class="wy-nav-content">
        <div class="rst-content">
          















<div role="navigation" aria-label="breadcrumbs navigation">

  <ul class="wy-breadcrumbs">
    
      <li><a href="index.html">Docs</a> &raquo;</li>
        
      <li>Batch processing</li>
    
    
      <li class="wy-breadcrumbs-aside">
        
            
            <a href="_sources/batch.rst.txt" rel="nofollow"> View page source</a>
          
        
      </li>
    
  </ul>

  
  <hr/>
</div>
          <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
           <div itemprop="articleBody">
            
  <div class="section" id="batch-processing">
<h1>Batch processing<a class="headerlink" href="#batch-processing" title="Permalink to this headline"></a></h1>
<p>This article provides information about running OCRmyPDF on multiple files or configuring it as a service triggered by file system events.</p>
<div class="section" id="batch-jobs">
<h2>Batch jobs<a class="headerlink" href="#batch-jobs" title="Permalink to this headline"></a></h2>
<p>Consider using the excellent <a class="reference external" href="https://www.gnu.org/software/parallel/">GNU Parallel</a> to apply OCRmyPDF to multiple files at once.</p>
<p>Both <code class="docutils literal"><span class="pre">parallel</span></code> and <code class="docutils literal"><span class="pre">ocrmypdf</span></code> will try to use all available processors. To maximize parallelism without overloading your system with processes, consider using <code class="docutils literal"><span class="pre">parallel</span> <span class="pre">-j</span> <span class="pre">2</span></code> to limit parallel to running two jobs at once.</p>
<p>This command will run ocrmypdf on all files named <code class="docutils literal"><span class="pre">*.pdf</span></code> in the current directory and write the results to the previously created <code class="docutils literal"><span class="pre">output/</span></code> folder. It will not search subdirectories.</p>
<p>The <code class="docutils literal"><span class="pre">--tag</span></code> argument tells parallel to print the filename as a prefix whenever a message is printed, so that one can trace any errors to the file that produced them.</p>
<div class="highlight-bash"><div class="highlight"><pre><span></span>parallel --tag -j <span class="m">2</span> ocrmypdf <span class="s1">&#39;{}&#39;</span> <span class="s1">&#39;output/{}&#39;</span> ::: *.pdf
</pre></div>
</div>
<p>OCRmyPDF automatically repairs PDFs before parsing and gathering information from them.  If you are already repairing PDFs with <code class="docutils literal"><span class="pre">qpdf</span></code> prior to attempting OCR, you can use <code class="docutils literal"><span class="pre">--skip-repair</span></code> to skip this step.  It may improve performance for large files, since repairing PDFs is single-threaded.</p>
</div>
<div class="section" id="directory-trees">
<h2>Directory trees<a class="headerlink" href="#directory-trees" title="Permalink to this headline"></a></h2>
<p>This will walk through a directory tree and run OCR on all files in place, printing each path it visits so that any messages can be traced to the file being processed:</p>
<div class="highlight-bash"><div class="highlight"><pre><span></span>find . -printf <span class="s1">&#39;%p&#39;</span> -name <span class="s1">&#39;*.pdf&#39;</span> -exec ocrmypdf <span class="s1">&#39;{}&#39;</span> <span class="s1">&#39;{}&#39;</span> <span class="se">\;</span>
</pre></div>
</div>
<p>This only runs one <code class="docutils literal"><span class="pre">ocrmypdf</span></code> process at a time. This variation uses <code class="docutils literal"><span class="pre">find</span></code> to create a directory list and <code class="docutils literal"><span class="pre">parallel</span></code> to parallelize runs of <code class="docutils literal"><span class="pre">ocrmypdf</span></code>, again updating files in place.</p>
<div class="highlight-bash"><div class="highlight"><pre><span></span>find . -name <span class="s1">&#39;*.pdf&#39;</span> <span class="p">|</span> parallel --tag -j <span class="m">2</span> ocrmypdf <span class="s1">&#39;{}&#39;</span> <span class="s1">&#39;{}&#39;</span>
</pre></div>
</div>
<div class="section" id="sample-script">
<h3>Sample script<a class="headerlink" href="#sample-script" title="Permalink to this headline"></a></h3>
<p>This user-contributed script also provides an example of batch processing.</p>
<div class="highlight-python"><div class="highlight"><pre><span></span><span class="ch">#!/usr/bin/env python3</span>
<span class="c1"># Walk through directory tree, replacing all files with OCR&#39;d version</span>
<span class="c1"># Contributed by DeliciousPickle@github</span>

<span class="kn">import</span> <span class="nn">logging</span>
<span class="kn">import</span> <span class="nn">os</span>
<span class="kn">import</span> <span class="nn">subprocess</span>
<span class="kn">import</span> <span class="nn">sys</span>

<span class="n">script_dir</span> <span class="o">=</span> <span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">dirname</span><span class="p">(</span><span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">realpath</span><span class="p">(</span><span class="vm">__file__</span><span class="p">))</span>
<span class="k">print</span><span class="p">(</span><span class="n">script_dir</span> <span class="o">+</span> <span class="s1">&#39;/ocr-tree.py: Start&#39;</span><span class="p">)</span>

<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">sys</span><span class="o">.</span><span class="n">argv</span><span class="p">)</span> <span class="o">&gt;</span> <span class="mi">1</span><span class="p">:</span>
    <span class="n">start_dir</span> <span class="o">=</span> <span class="n">sys</span><span class="o">.</span><span class="n">argv</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span>
<span class="k">else</span><span class="p">:</span>
    <span class="n">start_dir</span> <span class="o">=</span> <span class="s1">&#39;.&#39;</span>

<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">sys</span><span class="o">.</span><span class="n">argv</span><span class="p">)</span> <span class="o">&gt;</span> <span class="mi">2</span><span class="p">:</span>
    <span class="n">log_file</span> <span class="o">=</span> <span class="n">sys</span><span class="o">.</span><span class="n">argv</span><span class="p">[</span><span class="mi">2</span><span class="p">]</span>
<span class="k">else</span><span class="p">:</span>
    <span class="n">log_file</span> <span class="o">=</span> <span class="n">script_dir</span> <span class="o">+</span> <span class="s1">&#39;/ocr-tree.log&#39;</span>

<span class="n">logging</span><span class="o">.</span><span class="n">basicConfig</span><span class="p">(</span>
                <span class="n">level</span><span class="o">=</span><span class="n">logging</span><span class="o">.</span><span class="n">INFO</span><span class="p">,</span> <span class="n">format</span><span class="o">=</span><span class="s1">&#39;</span><span class="si">%(asctime)s</span><span class="s1"> </span><span class="si">%(message)s</span><span class="s1">&#39;</span><span class="p">,</span>
                <span class="n">filename</span><span class="o">=</span><span class="n">log_file</span><span class="p">,</span> <span class="n">filemode</span><span class="o">=</span><span class="s1">&#39;w&#39;</span><span class="p">)</span>

<span class="k">for</span> <span class="n">dir_name</span><span class="p">,</span> <span class="n">subdirs</span><span class="p">,</span> <span class="n">file_list</span> <span class="ow">in</span> <span class="n">os</span><span class="o">.</span><span class="n">walk</span><span class="p">(</span><span class="n">start_dir</span><span class="p">):</span>
    <span class="n">logging</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s1">&#39;</span><span class="se">\n</span><span class="s1">&#39;</span><span class="p">)</span>
    <span class="n">logging</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="n">dir_name</span> <span class="o">+</span> <span class="s1">&#39;</span><span class="se">\n</span><span class="s1">&#39;</span><span class="p">)</span>
    <span class="n">os</span><span class="o">.</span><span class="n">chdir</span><span class="p">(</span><span class="n">dir_name</span><span class="p">)</span>
    <span class="k">for</span> <span class="n">filename</span> <span class="ow">in</span> <span class="n">file_list</span><span class="p">:</span>
        <span class="n">file_ext</span> <span class="o">=</span> <span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">splitext</span><span class="p">(</span><span class="n">filename</span><span class="p">)[</span><span class="mi">1</span><span class="p">]</span>
        <span class="k">if</span> <span class="n">file_ext</span> <span class="o">==</span> <span class="s1">&#39;.pdf&#39;</span><span class="p">:</span>
            <span class="n">full_path</span> <span class="o">=</span> <span class="n">dir_name</span> <span class="o">+</span> <span class="s1">&#39;/&#39;</span> <span class="o">+</span> <span class="n">filename</span>
            <span class="k">print</span><span class="p">(</span><span class="n">full_path</span><span class="p">)</span>
            <span class="n">cmd</span> <span class="o">=</span> <span class="p">[</span><span class="s2">&quot;ocrmypdf&quot;</span><span class="p">,</span>  <span class="s2">&quot;--deskew&quot;</span><span class="p">,</span> <span class="n">filename</span><span class="p">,</span> <span class="n">filename</span><span class="p">]</span>
            <span class="n">logging</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="n">cmd</span><span class="p">)</span>
            <span class="n">proc</span> <span class="o">=</span> <span class="n">subprocess</span><span class="o">.</span><span class="n">Popen</span><span class="p">(</span>
                <span class="n">cmd</span><span class="p">,</span> <span class="n">stdout</span><span class="o">=</span><span class="n">subprocess</span><span class="o">.</span><span class="n">PIPE</span><span class="p">,</span> <span class="n">stderr</span><span class="o">=</span><span class="n">subprocess</span><span class="o">.</span><span class="n">STDOUT</span><span class="p">)</span>
            <span class="n">result</span> <span class="o">=</span> <span class="n">proc</span><span class="o">.</span><span class="n">stdout</span><span class="o">.</span><span class="n">read</span><span class="p">()</span>
            <span class="k">if</span> <span class="n">proc</span><span class="o">.</span><span class="n">returncode</span> <span class="o">==</span> <span class="mi">6</span><span class="p">:</span>
                <span class="k">print</span><span class="p">(</span><span class="s2">&quot;Skipped document because it already contained text&quot;</span><span class="p">)</span>
            <span class="k">elif</span> <span class="n">proc</span><span class="o">.</span><span class="n">returncode</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span>
                <span class="k">print</span><span class="p">(</span><span class="s2">&quot;OCR complete&quot;</span><span class="p">)</span>
            <span class="n">logging</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="n">result</span><span class="p">)</span>
</pre></div>
</div>
</div>
<div class="section" id="api">
<h3>API<a class="headerlink" href="#api" title="Permalink to this headline"></a></h3>
<p>OCRmyPDF is currently supported as a command line interface. This means that even if you are using OCRmyPDF in a Python script, you should run it in a subprocess rather than importing the ocrmypdf package.</p>
<p>The reason for this limitation is that the <a class="reference external" href="https://github.com/bunbun/ruffus/">ruffus</a> library that OCRmyPDF depends on is unfortunately not reentrant. OCRmyPDF works by defining each operation it does as a ruffus task that takes one or more files as input and generates one or more files as output. As such ruffus is fairly fundamental.</p>
<p>(If you find individual functions implemented in OCRmyPDF useful (such as <code class="docutils literal"><span class="pre">ocrmypdf.pdfinfo</span></code>), you can use these if you wish to.)</p>
</div>
<div class="section" id="synology-diskstations">
<h3>Synology DiskStations<a class="headerlink" href="#synology-diskstations" title="Permalink to this headline"></a></h3>
<p>Synology DiskStations (Network Attached Storage devices) can run the Docker image of OCRmyPDF if the Synology <a class="reference external" href="https://www.synology.com/en-global/dsm/packages/Docker">Docker package</a> is installed. Attached is a script to address particular quirks of using OCRmyPDF on one of these devices.</p>
<p>This is only possible for x86-based Synology products. Some Synology products use ARM or Power processors and do not support Docker. Further adjustments might be needed to deal with the Synology’s relatively limited CPU and RAM.</p>
<div class="highlight-python"><div class="highlight"><pre><span></span><span class="ch">#!/bin/env python3</span>
<span class="c1"># Contributed by github.com/Enantiomerie</span>

<span class="c1"># script needs 2 arguments</span>
<span class="c1"># 1. source dir with *.pdf - default is location of script</span>
<span class="c1"># 2. move dir where *.pdf and *_OCR.pdf are moved to</span>

<span class="kn">import</span> <span class="nn">logging</span>
<span class="kn">import</span> <span class="nn">os</span>
<span class="kn">import</span> <span class="nn">subprocess</span>
<span class="kn">import</span> <span class="nn">sys</span>
<span class="kn">import</span> <span class="nn">time</span>
<span class="kn">import</span> <span class="nn">shutil</span>

<span class="n">script_dir</span> <span class="o">=</span> <span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">dirname</span><span class="p">(</span><span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">realpath</span><span class="p">(</span><span class="vm">__file__</span><span class="p">))</span>
<span class="n">timestamp</span> <span class="o">=</span> <span class="n">time</span><span class="o">.</span><span class="n">strftime</span><span class="p">(</span><span class="s2">&quot;%Y-%m-</span><span class="si">%d</span><span class="s2">-%H%M_&quot;</span><span class="p">)</span>
<span class="n">log_file</span> <span class="o">=</span> <span class="n">script_dir</span> <span class="o">+</span> <span class="s1">&#39;/&#39;</span> <span class="o">+</span> <span class="n">timestamp</span> <span class="o">+</span> <span class="s1">&#39;ocrmypdf.log&#39;</span>
<span class="n">logging</span><span class="o">.</span><span class="n">basicConfig</span><span class="p">(</span><span class="n">level</span><span class="o">=</span><span class="n">logging</span><span class="o">.</span><span class="n">INFO</span><span class="p">,</span> <span class="n">format</span><span class="o">=</span><span class="s1">&#39;</span><span class="si">%(asctime)s</span><span class="s1"> </span><span class="si">%(message)s</span><span class="s1">&#39;</span><span class="p">,</span> <span class="n">filename</span><span class="o">=</span><span class="n">log_file</span><span class="p">,</span> <span class="n">filemode</span><span class="o">=</span><span class="s1">&#39;w&#39;</span><span class="p">)</span>

<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">sys</span><span class="o">.</span><span class="n">argv</span><span class="p">)</span> <span class="o">&gt;</span> <span class="mi">1</span><span class="p">:</span>
    <span class="n">start_dir</span> <span class="o">=</span> <span class="n">sys</span><span class="o">.</span><span class="n">argv</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span>
<span class="k">else</span><span class="p">:</span>
    <span class="n">start_dir</span> <span class="o">=</span> <span class="s1">&#39;.&#39;</span>

<span class="k">for</span> <span class="n">dir_name</span><span class="p">,</span> <span class="n">subdirs</span><span class="p">,</span> <span class="n">file_list</span> <span class="ow">in</span> <span class="n">os</span><span class="o">.</span><span class="n">walk</span><span class="p">(</span><span class="n">start_dir</span><span class="p">):</span>
    <span class="n">logging</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s1">&#39;</span><span class="se">\n</span><span class="s1">&#39;</span><span class="p">)</span>
    <span class="n">logging</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="n">dir_name</span> <span class="o">+</span> <span class="s1">&#39;</span><span class="se">\n</span><span class="s1">&#39;</span><span class="p">)</span>
    <span class="n">os</span><span class="o">.</span><span class="n">chdir</span><span class="p">(</span><span class="n">dir_name</span><span class="p">)</span>
    <span class="k">for</span> <span class="n">filename</span> <span class="ow">in</span> <span class="n">file_list</span><span class="p">:</span>
        <span class="n">file_ext</span> <span class="o">=</span> <span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">splitext</span><span class="p">(</span><span class="n">filename</span><span class="p">)[</span><span class="mi">1</span><span class="p">]</span>
        <span class="k">if</span> <span class="n">file_ext</span> <span class="o">==</span> <span class="s1">&#39;.pdf&#39;</span><span class="p">:</span>
            <span class="n">full_path</span> <span class="o">=</span> <span class="n">dir_name</span> <span class="o">+</span> <span class="s1">&#39;/&#39;</span> <span class="o">+</span> <span class="n">filename</span>
            <span class="n">file_noext</span> <span class="o">=</span> <span class="n">os</span><span class="o">.</span><span class="n">path</span><span class="o">.</span><span class="n">splitext</span><span class="p">(</span><span class="n">filename</span><span class="p">)[</span><span class="mi">0</span><span class="p">]</span>
            <span class="n">timestamp_OCR</span> <span class="o">=</span> <span class="n">time</span><span class="o">.</span><span class="n">strftime</span><span class="p">(</span><span class="s2">&quot;%Y-%m-</span><span class="si">%d</span><span class="s2">-%H%M_OCR_&quot;</span><span class="p">)</span>
            <span class="n">filename_OCR</span> <span class="o">=</span> <span class="n">timestamp_OCR</span> <span class="o">+</span> <span class="n">file_noext</span> <span class="o">+</span> <span class="s1">&#39;.pdf&#39;</span>
            <span class="n">docker_mount</span> <span class="o">=</span> <span class="n">dir_name</span> <span class="o">+</span> <span class="s1">&#39;:/home/docker&#39;</span>
<span class="c1"># create string for pdf processing</span>
<span class="c1"># diskstation needs a user:group docker:docker. find uid:gid of your diskstation docker:docker with id docker.</span>
<span class="c1"># use this uid:gid in -u flag</span>
<span class="c1"># rw rights for docker:docker at source dir are also necessary</span>
<span class="c1"># the script is processed as root user via cron</span>
            <span class="n">cmd</span> <span class="o">=</span> <span class="p">[</span><span class="s1">&#39;docker&#39;</span><span class="p">,</span> <span class="s1">&#39;run&#39;</span><span class="p">,</span> <span class="s1">&#39;--rm&#39;</span><span class="p">,</span> <span class="s1">&#39;-v&#39;</span><span class="p">,</span> <span class="n">docker_mount</span><span class="p">,</span> <span class="s1">&#39;-u=1030:65538&#39;</span><span class="p">,</span> <span class="s1">&#39;jbarlow83/ocrmypdf&#39;</span><span class="p">,</span> <span class="s1">&#39;--deskew&#39;</span> <span class="p">,</span> <span class="n">filename</span><span class="p">,</span> <span class="n">filename_OCR</span><span class="p">]</span>
            <span class="n">logging</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="n">cmd</span><span class="p">)</span>
            <span class="n">proc</span> <span class="o">=</span> <span class="n">subprocess</span><span class="o">.</span><span class="n">Popen</span><span class="p">(</span><span class="n">cmd</span><span class="p">,</span> <span class="n">stdout</span><span class="o">=</span><span class="n">subprocess</span><span class="o">.</span><span class="n">PIPE</span><span class="p">,</span> <span class="n">stderr</span><span class="o">=</span><span class="n">subprocess</span><span class="o">.</span><span class="n">STDOUT</span><span class="p">)</span>
            <span class="n">result</span> <span class="o">=</span> <span class="n">proc</span><span class="o">.</span><span class="n">stdout</span><span class="o">.</span><span class="n">read</span><span class="p">()</span>
            <span class="n">logging</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="n">result</span><span class="p">)</span>
            <span class="n">full_path_OCR</span> <span class="o">=</span> <span class="n">dir_name</span> <span class="o">+</span> <span class="s1">&#39;/&#39;</span> <span class="o">+</span> <span class="n">filename_OCR</span>
            <span class="n">os</span><span class="o">.</span><span class="n">chmod</span><span class="p">(</span><span class="n">full_path_OCR</span><span class="p">,</span> <span class="mi">0</span><span class="n">o666</span><span class="p">)</span>
            <span class="n">os</span><span class="o">.</span><span class="n">chmod</span><span class="p">(</span><span class="n">full_path</span><span class="p">,</span> <span class="mi">0</span><span class="n">o666</span><span class="p">)</span>
            <span class="n">full_path_OCR_archive</span> <span class="o">=</span> <span class="n">sys</span><span class="o">.</span><span class="n">argv</span><span class="p">[</span><span class="mi">2</span><span class="p">]</span>
            <span class="n">full_path_archive</span> <span class="o">=</span> <span class="n">sys</span><span class="o">.</span><span class="n">argv</span><span class="p">[</span><span class="mi">2</span><span class="p">]</span> <span class="o">+</span> <span class="s1">&#39;/no_ocr&#39;</span>
            <span class="n">shutil</span><span class="o">.</span><span class="n">move</span><span class="p">(</span><span class="n">full_path_OCR</span><span class="p">,</span><span class="n">full_path_OCR_archive</span><span class="p">)</span>
            <span class="n">shutil</span><span class="o">.</span><span class="n">move</span><span class="p">(</span><span class="n">full_path</span><span class="p">,</span> <span class="n">full_path_archive</span><span class="p">)</span>
<span class="n">logging</span><span class="o">.</span><span class="n">info</span><span class="p">(</span><span class="s1">&#39;Finished.</span><span class="se">\n</span><span class="s1">&#39;</span><span class="p">)</span>
</pre></div>
</div>
</div>
<div class="section" id="huge-batch-jobs">
<h3>Huge batch jobs<a class="headerlink" href="#huge-batch-jobs" title="Permalink to this headline"></a></h3>
<p>If you have thousands of files to work with, contact the author. Consulting work related to OCRmyPDF helps fund this open source project and all inquiries are appreciated.</p>
</div>
</div>
<div class="section" id="hot-watched-folders">
<h2>Hot (watched) folders<a class="headerlink" href="#hot-watched-folders" title="Permalink to this headline"></a></h2>
<p>To set up a “hot folder” that will trigger OCR for every file inserted, use a program like Python <a class="reference external" href="https://pypi.python.org/pypi/watchdog">watchdog</a> (supports all major OS).</p>
<p>One could then configure a scanner to automatically place scanned files in a hot folder, so that they will be queued for OCR and copied to the destination.</p>
<div class="highlight-bash"><div class="highlight"><pre><span></span>pip install watchdog
</pre></div>
</div>
<p>watchdog installs the command line program <code class="docutils literal"><span class="pre">watchmedo</span></code>, which can be told to run <code class="docutils literal"><span class="pre">ocrmypdf</span></code> on any .pdf added to the current directory (<code class="docutils literal"><span class="pre">.</span></code>) and place the result in the previously created <code class="docutils literal"><span class="pre">out/</span></code> folder.</p>
<div class="highlight-bash"><div class="highlight"><pre><span></span><span class="nb">cd</span> hot-folder
mkdir out
watchmedo shell-command <span class="se">\</span>
        --patterns<span class="o">=</span><span class="s2">&quot;*.pdf&quot;</span> <span class="se">\</span>
        --ignore-directories <span class="se">\</span>
        --command<span class="o">=</span><span class="s1">&#39;ocrmypdf &quot;${watch_src_path}&quot; &quot;out/${watch_src_path}&quot; &#39;</span> <span class="se">\</span>
        .  <span class="c1"># don&#39;t forget the final dot</span>
</pre></div>
</div>
<p>For more complex behavior you can write a Python script that uses the watchdog API.</p>
<p>On file servers, you could configure watchmedo as a system service so it will run all the time.</p>
<div class="section" id="caveats">
<h3>Caveats<a class="headerlink" href="#caveats" title="Permalink to this headline"></a></h3>
<ul class="simple">
<li><code class="docutils literal"><span class="pre">watchmedo</span></code> may not work properly on a networked file system, depending on the capabilities of the file system client and server.</li>
<li>This simple recipe does not filter for the type of file system event, so file copies, deletes and moves, and directory operations, will all be sent to ocrmypdf, producing errors in several cases. Disable your watched folder if you are doing anything other than copying files to it.</li>
<li>If the source and destination directory are the same, watchmedo may create an infinite loop.</li>
<li>On BSD, FreeBSD and older versions of macOS, you may need to increase the number of file descriptors to monitor more files, using <code class="docutils literal"><span class="pre">ulimit</span> <span class="pre">-n</span> <span class="pre">1024</span></code> to watch a folder of up to 1024 files.</li>
</ul>
</div>
<div class="section" id="alternatives">
<h3>Alternatives<a class="headerlink" href="#alternatives" title="Permalink to this headline"></a></h3>
<ul class="simple">
<li><a class="reference external" href="https://facebook.github.io/watchman/">Watchman</a> is a more powerful alternative to <code class="docutils literal"><span class="pre">watchmedo</span></code>.</li>
</ul>
</div>
</div>
</div>


           </div>
           <div class="articleComments">
            
           </div>
          </div>
          <footer>
  
    <div class="rst-footer-buttons" role="navigation" aria-label="footer navigation">
      
        <a href="security.html" class="btn btn-neutral float-right" title="PDF security issues" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right"></span></a>
      
      
        <a href="advanced.html" class="btn btn-neutral" title="Advanced features" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left"></span> Previous</a>
      
    </div>
  

  <hr/>

  <div role="contentinfo">
    <p>
        &copy; Copyright 2018, James R. Barlow. Licensed under Creative Commons Attribution-ShareAlike 4.0.

    </p>
  </div>
  Built with <a href="http://sphinx-doc.org/">Sphinx</a> using a <a href="https://github.com/snide/sphinx_rtd_theme">theme</a> provided by <a href="https://readthedocs.org">Read the Docs</a>. 

</footer>

        </div>
      </div>

    </section>

  </div>
  


  

    <script type="text/javascript">
        var DOCUMENTATION_OPTIONS = {
            URL_ROOT:'./',
            VERSION:'6.1.2',
            COLLAPSE_INDEX:false,
            FILE_SUFFIX:'.html',
            HAS_SOURCE:  true,
            SOURCELINK_SUFFIX: '.txt'
        };
    </script>
      <script type="text/javascript" src="_static/jquery.js"></script>
      <script type="text/javascript" src="_static/underscore.js"></script>
      <script type="text/javascript" src="_static/doctools.js"></script>

  

  
  
    <script type="text/javascript" src="_static/js/theme.js"></script>
  

  
  
  <script type="text/javascript">
      jQuery(function () {
          SphinxRtdTheme.StickyNav.enable();
      });
  </script>
   

</body>
</html>