<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  ~ Licensed to the Apache Software Foundation (ASF) under one or more
  ~ contributor license agreements.  See the NOTICE file distributed with
  ~ this work for additional information regarding copyright ownership.
  ~ The ASF licenses this file to You under the Apache License, Version 2.0
  ~ (the "License"); you may not use this file except in compliance with
  ~ the License.  You may obtain a copy of the License at
  ~
  ~     http://www.apache.org/licenses/LICENSE-2.0
  ~
  ~ Unless required by applicable law or agreed to in writing, software
  ~ distributed under the License is distributed on an "AS IS" BASIS,
  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  ~ See the License for the specific language governing permissions and
  ~ limitations under the License.
  -->
<properties>
  <parsers>
    <!-- Load TesseractOCRParser (could use DefaultParser if you want others too) -->
    <parser class="org.apache.tika.parser.ocr.TesseractOCRParser"/>

    <!-- OCR on rendered pages -->
    <parser class="org.apache.tika.parser.pdf.PDFParser">
      <params>
        <!-- ocrStrategy values:
               no_ocr       - extract text only
               ocr_only     - don't extract text and just attempt OCR
               ocr_and_text - extract text and attempt OCR (from Tika 1.24)
               auto         - extract text but if < 10 characters try OCR
        -->
        <param name="ocrStrategy" type="string">ocr_and_text</param>
        <param name="ocrImageType" type="string">rgb</param>
        <param name="ocrDPI" type="int">100</param>
      </params>
    </parser>
  </parsers>
  <server>
    <params>
      <logLevel>debug</logLevel>
      <!-- Maximum time to allow per parse before shutting down and restarting
           the forked parser. Not allowed if nofork=true.
           The value is in milliseconds: 40000000 ms is roughly 11.1 hours. -->
      <taskTimeoutMillis>40000000</taskTimeoutMillis>
    </params>
  </server>
</properties>