1396 lines
40 KiB
HTML
1396 lines
40 KiB
HTML
<!DOCTYPE html>
|
|
<html lang="en">
|
|
<head>
|
|
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
|
|
<meta name="generator" content="AsciiDoc 10.2.0">
|
|
<title>TESSERACT(1)</title>
|
|
<style type="text/css">
|
|
/* Shared CSS for AsciiDoc xhtml11 and html5 backends */
|
|
|
|
/* Default font. */
|
|
body {
|
|
font-family: Georgia,serif;
|
|
}
|
|
|
|
/* Title font. */
|
|
h1, h2, h3, h4, h5, h6,
|
|
div.title, caption.title,
|
|
thead, p.table.header,
|
|
#toctitle,
|
|
#author, #revnumber, #revdate, #revremark,
|
|
#footer {
|
|
font-family: Arial,Helvetica,sans-serif;
|
|
}
|
|
|
|
body {
|
|
margin: 1em 5% 1em 5%;
|
|
}
|
|
|
|
a {
|
|
color: blue;
|
|
text-decoration: underline;
|
|
}
|
|
a:visited {
|
|
color: fuchsia;
|
|
}
|
|
|
|
em {
|
|
font-style: italic;
|
|
color: navy;
|
|
}
|
|
|
|
strong {
|
|
font-weight: bold;
|
|
color: #083194;
|
|
}
|
|
|
|
h1, h2, h3, h4, h5, h6 {
|
|
color: #527bbd;
|
|
margin-top: 1.2em;
|
|
margin-bottom: 0.5em;
|
|
line-height: 1.3;
|
|
}
|
|
|
|
h1, h2, h3 {
|
|
border-bottom: 2px solid silver;
|
|
}
|
|
h2 {
|
|
padding-top: 0.5em;
|
|
}
|
|
h3 {
|
|
float: left;
|
|
}
|
|
h3 + * {
|
|
clear: left;
|
|
}
|
|
h5 {
|
|
font-size: 1.0em;
|
|
}
|
|
|
|
div.sectionbody {
|
|
margin-left: 0;
|
|
}
|
|
|
|
hr {
|
|
border: 1px solid silver;
|
|
}
|
|
|
|
p {
|
|
margin-top: 0.5em;
|
|
margin-bottom: 0.5em;
|
|
}
|
|
|
|
ul, ol, li > p {
|
|
margin-top: 0;
|
|
}
|
|
ul > li { color: #aaa; }
|
|
ul > li > * { color: black; }
|
|
|
|
.monospaced, code, pre {
|
|
font-family: "Courier New", Courier, monospace;
|
|
font-size: inherit;
|
|
color: navy;
|
|
padding: 0;
|
|
margin: 0;
|
|
}
|
|
pre {
|
|
white-space: pre-wrap;
|
|
}
|
|
|
|
#author {
|
|
color: #527bbd;
|
|
font-weight: bold;
|
|
font-size: 1.1em;
|
|
}
|
|
#email {
|
|
}
|
|
#revnumber, #revdate, #revremark {
|
|
}
|
|
|
|
#footer {
|
|
font-size: small;
|
|
border-top: 2px solid silver;
|
|
padding-top: 0.5em;
|
|
margin-top: 4.0em;
|
|
}
|
|
#footer-text {
|
|
float: left;
|
|
padding-bottom: 0.5em;
|
|
}
|
|
#footer-badges {
|
|
float: right;
|
|
padding-bottom: 0.5em;
|
|
}
|
|
|
|
#preamble {
|
|
margin-top: 1.5em;
|
|
margin-bottom: 1.5em;
|
|
}
|
|
div.imageblock, div.exampleblock, div.verseblock,
|
|
div.quoteblock, div.literalblock, div.listingblock, div.sidebarblock,
|
|
div.admonitionblock {
|
|
margin-top: 1.0em;
|
|
margin-bottom: 1.5em;
|
|
}
|
|
div.admonitionblock {
|
|
margin-top: 2.0em;
|
|
margin-bottom: 2.0em;
|
|
margin-right: 10%;
|
|
color: #606060;
|
|
}
|
|
|
|
div.content { /* Block element content. */
|
|
padding: 0;
|
|
}
|
|
|
|
/* Block element titles. */
|
|
div.title, caption.title {
|
|
color: #527bbd;
|
|
font-weight: bold;
|
|
text-align: left;
|
|
margin-top: 1.0em;
|
|
margin-bottom: 0.5em;
|
|
}
|
|
div.title + * {
|
|
margin-top: 0;
|
|
}
|
|
|
|
td div.title:first-child {
|
|
margin-top: 0.0em;
|
|
}
|
|
div.content div.title:first-child {
|
|
margin-top: 0.0em;
|
|
}
|
|
div.content + div.title {
|
|
margin-top: 0.0em;
|
|
}
|
|
|
|
div.sidebarblock > div.content {
|
|
background: #ffffee;
|
|
border: 1px solid #dddddd;
|
|
border-left: 4px solid #f0f0f0;
|
|
padding: 0.5em;
|
|
}
|
|
|
|
div.listingblock > div.content {
|
|
border: 1px solid #dddddd;
|
|
border-left: 5px solid #f0f0f0;
|
|
background: #f8f8f8;
|
|
padding: 0.5em;
|
|
}
|
|
|
|
div.quoteblock, div.verseblock {
|
|
padding-left: 1.0em;
|
|
margin-left: 1.0em;
|
|
margin-right: 10%;
|
|
border-left: 5px solid #f0f0f0;
|
|
color: #888;
|
|
}
|
|
|
|
div.quoteblock > div.attribution {
|
|
padding-top: 0.5em;
|
|
text-align: right;
|
|
}
|
|
|
|
div.verseblock > pre.content {
|
|
font-family: inherit;
|
|
font-size: inherit;
|
|
}
|
|
div.verseblock > div.attribution {
|
|
padding-top: 0.75em;
|
|
text-align: left;
|
|
}
|
|
/* DEPRECATED: Pre version 8.2.7 verse style literal block. */
|
|
div.verseblock + div.attribution {
|
|
text-align: left;
|
|
}
|
|
|
|
div.admonitionblock .icon {
|
|
vertical-align: top;
|
|
font-size: 1.1em;
|
|
font-weight: bold;
|
|
text-decoration: underline;
|
|
color: #527bbd;
|
|
padding-right: 0.5em;
|
|
}
|
|
div.admonitionblock td.content {
|
|
padding-left: 0.5em;
|
|
border-left: 3px solid #dddddd;
|
|
}
|
|
|
|
div.exampleblock > div.content {
|
|
border-left: 3px solid #dddddd;
|
|
padding-left: 0.5em;
|
|
}
|
|
|
|
div.imageblock div.content { padding-left: 0; }
|
|
span.image img { border-style: none; vertical-align: text-bottom; }
|
|
a.image:visited { color: white; }
|
|
|
|
dl {
|
|
margin-top: 0.8em;
|
|
margin-bottom: 0.8em;
|
|
}
|
|
dt {
|
|
margin-top: 0.5em;
|
|
margin-bottom: 0;
|
|
font-style: normal;
|
|
color: navy;
|
|
}
|
|
dd > *:first-child {
|
|
margin-top: 0.1em;
|
|
}
|
|
|
|
ul, ol {
|
|
list-style-position: outside;
|
|
}
|
|
ol.arabic {
|
|
list-style-type: decimal;
|
|
}
|
|
ol.loweralpha {
|
|
list-style-type: lower-alpha;
|
|
}
|
|
ol.upperalpha {
|
|
list-style-type: upper-alpha;
|
|
}
|
|
ol.lowerroman {
|
|
list-style-type: lower-roman;
|
|
}
|
|
ol.upperroman {
|
|
list-style-type: upper-roman;
|
|
}
|
|
|
|
/* Compact blocks: shrink the vertical gap around nested lists, paragraphs
   and divs. Each selector is listed once; repeating a selector inside a
   selector list has no effect. */
div.compact ul, div.compact ol,
div.compact p,
div.compact div {
  margin-top: 0.1em;
  margin-bottom: 0.1em;
}
|
|
|
|
tfoot {
|
|
font-weight: bold;
|
|
}
|
|
td > div.verse {
|
|
white-space: pre;
|
|
}
|
|
|
|
div.hdlist {
|
|
margin-top: 0.8em;
|
|
margin-bottom: 0.8em;
|
|
}
|
|
div.hdlist tr {
|
|
padding-bottom: 15px;
|
|
}
|
|
dt.hdlist1.strong, td.hdlist1.strong {
|
|
font-weight: bold;
|
|
}
|
|
td.hdlist1 {
|
|
vertical-align: top;
|
|
font-style: normal;
|
|
padding-right: 0.8em;
|
|
color: navy;
|
|
}
|
|
td.hdlist2 {
|
|
vertical-align: top;
|
|
}
|
|
div.hdlist.compact tr {
|
|
margin: 0;
|
|
padding-bottom: 0;
|
|
}
|
|
|
|
.comment {
|
|
background: yellow;
|
|
}
|
|
|
|
.footnote, .footnoteref {
|
|
font-size: 0.8em;
|
|
}
|
|
|
|
span.footnote, span.footnoteref {
|
|
vertical-align: super;
|
|
}
|
|
|
|
#footnotes {
|
|
margin: 20px 0 20px 0;
|
|
padding: 7px 0 0 0;
|
|
}
|
|
|
|
#footnotes div.footnote {
|
|
margin: 0 0 5px 0;
|
|
}
|
|
|
|
#footnotes hr {
|
|
border: none;
|
|
border-top: 1px solid silver;
|
|
height: 1px;
|
|
text-align: left;
|
|
margin-left: 0;
|
|
width: 20%;
|
|
min-width: 100px;
|
|
}
|
|
|
|
div.colist td {
|
|
padding-right: 0.5em;
|
|
padding-bottom: 0.3em;
|
|
vertical-align: top;
|
|
}
|
|
div.colist td img {
|
|
margin-top: 0.3em;
|
|
}
|
|
|
|
@media print {
|
|
#footer-badges { display: none; }
|
|
}
|
|
|
|
#toc {
|
|
margin-bottom: 2.5em;
|
|
}
|
|
|
|
#toctitle {
|
|
color: #527bbd;
|
|
font-size: 1.1em;
|
|
font-weight: bold;
|
|
margin-top: 1.0em;
|
|
margin-bottom: 0.1em;
|
|
}
|
|
|
|
div.toclevel0, div.toclevel1, div.toclevel2, div.toclevel3, div.toclevel4 {
|
|
margin-top: 0;
|
|
margin-bottom: 0;
|
|
}
|
|
div.toclevel2 {
|
|
margin-left: 2em;
|
|
font-size: 0.9em;
|
|
}
|
|
div.toclevel3 {
|
|
margin-left: 4em;
|
|
font-size: 0.9em;
|
|
}
|
|
div.toclevel4 {
|
|
margin-left: 6em;
|
|
font-size: 0.9em;
|
|
}
|
|
|
|
span.aqua { color: aqua; }
|
|
span.black { color: black; }
|
|
span.blue { color: blue; }
|
|
span.fuchsia { color: fuchsia; }
|
|
span.gray { color: gray; }
|
|
span.green { color: green; }
|
|
span.lime { color: lime; }
|
|
span.maroon { color: maroon; }
|
|
span.navy { color: navy; }
|
|
span.olive { color: olive; }
|
|
span.purple { color: purple; }
|
|
span.red { color: red; }
|
|
span.silver { color: silver; }
|
|
span.teal { color: teal; }
|
|
span.white { color: white; }
|
|
span.yellow { color: yellow; }
|
|
|
|
span.aqua-background { background: aqua; }
|
|
span.black-background { background: black; }
|
|
span.blue-background { background: blue; }
|
|
span.fuchsia-background { background: fuchsia; }
|
|
span.gray-background { background: gray; }
|
|
span.green-background { background: green; }
|
|
span.lime-background { background: lime; }
|
|
span.maroon-background { background: maroon; }
|
|
span.navy-background { background: navy; }
|
|
span.olive-background { background: olive; }
|
|
span.purple-background { background: purple; }
|
|
span.red-background { background: red; }
|
|
span.silver-background { background: silver; }
|
|
span.teal-background { background: teal; }
|
|
span.white-background { background: white; }
|
|
span.yellow-background { background: yellow; }
|
|
|
|
span.big { font-size: 2em; }
|
|
span.small { font-size: 0.6em; }
|
|
|
|
span.underline { text-decoration: underline; }
|
|
span.overline { text-decoration: overline; }
|
|
span.line-through { text-decoration: line-through; }
|
|
|
|
div.unbreakable { page-break-inside: avoid; }
|
|
|
|
|
|
/*
|
|
* xhtml11 specific
|
|
*
|
|
* */
|
|
|
|
div.tableblock {
|
|
margin-top: 1.0em;
|
|
margin-bottom: 1.5em;
|
|
}
|
|
div.tableblock > table {
|
|
border: 3px solid #527bbd;
|
|
}
|
|
thead, p.table.header {
|
|
font-weight: bold;
|
|
color: #527bbd;
|
|
}
|
|
p.table {
|
|
margin-top: 0;
|
|
}
|
|
/* Because the table frame attribute is overridden by CSS in most browsers. */
|
|
div.tableblock > table[frame="void"] {
|
|
border-style: none;
|
|
}
|
|
div.tableblock > table[frame="hsides"] {
|
|
border-left-style: none;
|
|
border-right-style: none;
|
|
}
|
|
div.tableblock > table[frame="vsides"] {
|
|
border-top-style: none;
|
|
border-bottom-style: none;
|
|
}
|
|
|
|
|
|
/*
|
|
* html5 specific
|
|
*
|
|
* */
|
|
|
|
table.tableblock {
|
|
margin-top: 1.0em;
|
|
margin-bottom: 1.5em;
|
|
}
|
|
thead, p.tableblock.header {
|
|
font-weight: bold;
|
|
color: #527bbd;
|
|
}
|
|
p.tableblock {
|
|
margin-top: 0;
|
|
}
|
|
table.tableblock {
|
|
border-width: 3px;
|
|
border-spacing: 0px;
|
|
border-style: solid;
|
|
border-color: #527bbd;
|
|
border-collapse: collapse;
|
|
}
|
|
th.tableblock, td.tableblock {
|
|
border-width: 1px;
|
|
padding: 4px;
|
|
border-style: solid;
|
|
border-color: #527bbd;
|
|
}
|
|
|
|
table.tableblock.frame-topbot {
|
|
border-left-style: hidden;
|
|
border-right-style: hidden;
|
|
}
|
|
table.tableblock.frame-sides {
|
|
border-top-style: hidden;
|
|
border-bottom-style: hidden;
|
|
}
|
|
table.tableblock.frame-none {
|
|
border-style: hidden;
|
|
}
|
|
|
|
th.tableblock.halign-left, td.tableblock.halign-left {
|
|
text-align: left;
|
|
}
|
|
th.tableblock.halign-center, td.tableblock.halign-center {
|
|
text-align: center;
|
|
}
|
|
th.tableblock.halign-right, td.tableblock.halign-right {
|
|
text-align: right;
|
|
}
|
|
|
|
th.tableblock.valign-top, td.tableblock.valign-top {
|
|
vertical-align: top;
|
|
}
|
|
th.tableblock.valign-middle, td.tableblock.valign-middle {
|
|
vertical-align: middle;
|
|
}
|
|
th.tableblock.valign-bottom, td.tableblock.valign-bottom {
|
|
vertical-align: bottom;
|
|
}
|
|
|
|
|
|
/*
|
|
* manpage specific
|
|
*
|
|
* */
|
|
|
|
body.manpage h1 {
|
|
padding-top: 0.5em;
|
|
padding-bottom: 0.5em;
|
|
border-top: 2px solid silver;
|
|
border-bottom: 2px solid silver;
|
|
}
|
|
body.manpage h2 {
|
|
border-style: none;
|
|
}
|
|
body.manpage div.sectionbody {
|
|
margin-left: 3em;
|
|
}
|
|
|
|
@media print {
|
|
body.manpage div#toc { display: none; }
|
|
}
|
|
|
|
|
|
</style>
|
|
<script type="text/javascript">
|
|
/*<+'])');
|
|
// Function that scans the DOM tree for header elements (the DOM2
|
|
// nodeIterator API would be a better technique but not supported by all
|
|
// browsers).
|
|
var iterate = function (el) {
|
|
for (var i = el.firstChild; i != null; i = i.nextSibling) {
|
|
if (i.nodeType == 1 /* Node.ELEMENT_NODE */) {
|
|
var mo = re.exec(i.tagName);
|
|
if (mo && (i.getAttribute("class") || i.getAttribute("className")) != "float") {
|
|
result[result.length] = new TocEntry(i, getText(i), mo[1]-1);
|
|
}
|
|
iterate(i);
|
|
}
|
|
}
|
|
}
|
|
iterate(el);
|
|
return result;
|
|
}
|
|
|
|
var toc = document.getElementById("toc");
|
|
if (!toc) {
|
|
return;
|
|
}
|
|
|
|
// Delete existing TOC entries in case we're reloading the TOC.
|
|
var tocEntriesToRemove = [];
|
|
var i;
|
|
for (i = 0; i < toc.childNodes.length; i++) {
|
|
var entry = toc.childNodes[i];
|
|
if (entry.nodeName.toLowerCase() == 'div'
|
|
&& entry.getAttribute("class")
|
|
&& entry.getAttribute("class").match(/^toclevel/))
|
|
tocEntriesToRemove.push(entry);
|
|
}
|
|
for (i = 0; i < tocEntriesToRemove.length; i++) {
|
|
toc.removeChild(tocEntriesToRemove[i]);
|
|
}
|
|
|
|
// Rebuild TOC entries.
|
|
var entries = tocEntries(document.getElementById("content"), toclevels);
|
|
for (var i = 0; i < entries.length; ++i) {
|
|
var entry = entries[i];
|
|
if (entry.element.id == "")
|
|
entry.element.id = "_toc_" + i;
|
|
var a = document.createElement("a");
|
|
a.href = "#" + entry.element.id;
|
|
a.appendChild(document.createTextNode(entry.text));
|
|
var div = document.createElement("div");
|
|
div.appendChild(a);
|
|
div.className = "toclevel" + entry.toclevel;
|
|
toc.appendChild(div);
|
|
}
|
|
if (entries.length == 0)
|
|
toc.parentNode.removeChild(toc);
|
|
},
|
|
|
|
|
|
/////////////////////////////////////////////////////////////////////
|
|
// Footnotes generator
|
|
/////////////////////////////////////////////////////////////////////
|
|
|
|
/* Based on footnote generation code from:
|
|
* http://www.brandspankingnew.net/archive/2005/07/format_footnote.html
|
|
*/
|
|
|
|
footnotes: function () {
|
|
// Delete existing footnote entries in case we're reloading the footnodes.
|
|
var i;
|
|
var noteholder = document.getElementById("footnotes");
|
|
if (!noteholder) {
|
|
return;
|
|
}
|
|
var entriesToRemove = [];
|
|
for (i = 0; i < noteholder.childNodes.length; i++) {
|
|
var entry = noteholder.childNodes[i];
|
|
if (entry.nodeName.toLowerCase() == 'div' && entry.getAttribute("class") == "footnote")
|
|
entriesToRemove.push(entry);
|
|
}
|
|
for (i = 0; i < entriesToRemove.length; i++) {
|
|
noteholder.removeChild(entriesToRemove[i]);
|
|
}
|
|
|
|
// Rebuild footnote entries.
|
|
var cont = document.getElementById("content");
|
|
var spans = cont.getElementsByTagName("span");
|
|
var refs = {};
|
|
var n = 0;
|
|
for (i=0; i<spans.length; i++) {
|
|
if (spans[i].className == "footnote") {
|
|
n++;
|
|
var note = spans[i].getAttribute("data-note");
|
|
if (!note) {
|
|
// Use [\s\S] in place of . so multi-line matches work.
|
|
// Because JavaScript has no s (dotall) regex flag.
|
|
note = spans[i].innerHTML.match(/\s*\[([\s\S]*)]\s*/)[1];
|
|
spans[i].innerHTML =
|
|
"[<a id='_footnoteref_" + n + "' href='#_footnote_" + n +
|
|
"' title='View footnote' class='footnote'>" + n + "</a>]";
|
|
spans[i].setAttribute("data-note", note);
|
|
}
|
|
noteholder.innerHTML +=
|
|
"<div class='footnote' id='_footnote_" + n + "'>" +
|
|
"<a href='#_footnoteref_" + n + "' title='Return to text'>" +
|
|
n + "</a>. " + note + "</div>";
|
|
var id =spans[i].getAttribute("id");
|
|
if (id != null) refs["#"+id] = n;
|
|
}
|
|
}
|
|
if (n == 0)
|
|
noteholder.parentNode.removeChild(noteholder);
|
|
else {
|
|
// Process footnoterefs.
|
|
for (i=0; i<spans.length; i++) {
|
|
if (spans[i].className == "footnoteref") {
|
|
var href = spans[i].getElementsByTagName("a")[0].getAttribute("href");
|
|
href = href.match(/#.*/)[0]; // Because IE return full URL.
|
|
n = refs[href];
|
|
spans[i].innerHTML =
|
|
"[<a href='#_footnote_" + n +
|
|
"' title='View footnote' class='footnote'>" + n + "</a>]";
|
|
}
|
|
}
|
|
}
|
|
},
|
|
|
|
install: function(toclevels) {
|
|
var timerId;
|
|
|
|
function reinstall() {
|
|
asciidoc.footnotes();
|
|
if (toclevels) {
|
|
asciidoc.toc(toclevels);
|
|
}
|
|
}
|
|
|
|
function reinstallAndRemoveTimer() {
|
|
clearInterval(timerId);
|
|
reinstall();
|
|
}
|
|
|
|
timerId = setInterval(reinstall, 500);
|
|
if (document.addEventListener)
|
|
document.addEventListener("DOMContentLoaded", reinstallAndRemoveTimer, false);
|
|
else
|
|
window.onload = reinstallAndRemoveTimer;
|
|
}
|
|
|
|
}
|
|
asciidoc.install();
|
|
/*]]>*/
|
|
</script>
|
|
</head>
|
|
<body class="manpage">
|
|
<div id="header">
|
|
<h1>
|
|
TESSERACT(1) Manual Page
|
|
</h1>
|
|
<h2>NAME</h2>
|
|
<div class="sectionbody">
|
|
<p>tesseract -
|
|
command-line OCR engine
|
|
</p>
|
|
</div>
|
|
</div>
|
|
<div id="content">
|
|
<div class="sect1">
|
|
<h2 id="_synopsis">SYNOPSIS</h2>
|
|
<div class="sectionbody">
|
|
<div class="paragraph"><p><strong>tesseract</strong> <em>FILE</em> <em>OUTPUTBASE</em> [<em>OPTIONS</em>]… [<em>CONFIGFILE</em>]…</p></div>
|
|
</div>
|
|
</div>
|
|
<div class="sect1">
|
|
<h2 id="_description">DESCRIPTION</h2>
|
|
<div class="sectionbody">
|
|
<div class="paragraph"><p>tesseract(1) is a commercial quality OCR engine originally developed at HP
|
|
between 1985 and 1995. In 1995, this engine was among the top 3 evaluated by
|
|
UNLV. It was open-sourced by HP and UNLV in 2005, and was developed
|
|
at Google until 2018.</p></div>
|
|
</div>
|
|
</div>
|
|
<div class="sect1">
|
|
<h2 id="_in_out_arguments">IN/OUT ARGUMENTS</h2>
|
|
<div class="sectionbody">
|
|
<div class="dlist"><dl>
|
|
<dt class="hdlist1">
|
|
<em>FILE</em>
|
|
</dt>
|
|
<dd>
|
|
<p>
|
|
The name of the input file.
|
|
This can either be an image file or a text file.<br>
|
|
Most image file formats (anything readable by Leptonica) are supported.<br>
|
|
A text file lists the names of all input images (one image name per line).
|
|
The results will be combined in a single file for each output file format
|
|
(txt, pdf, hocr, xml).<br>
|
|
If <em>FILE</em> is <span class="monospaced">stdin</span> or <span class="monospaced">-</span> then the standard input is used.
|
|
</p>
|
|
</dd>
|
|
<dt class="hdlist1">
|
|
<em>OUTPUTBASE</em>
|
|
</dt>
|
|
<dd>
|
|
<p>
|
|
The basename of the output file (to which the appropriate extension
|
|
will be appended). By default the output will be a text file
|
|
with <span class="monospaced">.txt</span> added to the basename unless there are one or more
|
|
parameters set which explicitly specify the desired output.<br>
|
|
If <em>OUTPUTBASE</em> is <span class="monospaced">stdout</span> or <span class="monospaced">-</span> then the standard output is used.
|
|
</p>
|
|
</dd>
|
|
</dl></div>
|
|
</div>
|
|
</div>
|
|
<div class="sect1">
|
|
<h2 id="TESSDATADIR">OPTIONS</h2>
|
|
<div class="sectionbody">
|
|
<div class="dlist"><dl>
|
|
<dt class="hdlist1">
|
|
<strong>-c</strong> <em>CONFIGVAR=VALUE</em>
|
|
</dt>
|
|
<dd>
|
|
<p>
|
|
Set value for parameter <em>CONFIGVAR</em> to VALUE. Multiple <strong>-c</strong> arguments are allowed.
|
|
</p>
|
|
</dd>
|
|
<dt class="hdlist1">
|
|
<strong>--dpi</strong> <em>N</em>
|
|
</dt>
|
|
<dd>
|
|
<p>
|
|
Specify the resolution <em>N</em> in DPI for the input image(s).
|
|
A typical value for <em>N</em> is <span class="monospaced">300</span>. Without this option,
|
|
the resolution is read from the metadata included in the image.
|
|
If an image does not include that information, Tesseract tries to guess it.
|
|
</p>
|
|
</dd>
|
|
<dt class="hdlist1">
|
|
<strong>-l</strong> <em>LANG</em>
|
|
</dt>
|
|
<dt class="hdlist1">
|
|
<strong>-l</strong> <em>SCRIPT</em>
|
|
</dt>
|
|
<dd>
|
|
<p>
|
|
The language or script to use.
|
|
If none is specified, <span class="monospaced">eng</span> (English) is assumed.
|
|
Multiple languages may be specified, separated by plus characters.
|
|
Tesseract uses 3-character ISO 639-2 language codes
|
|
(see <a href="#LANGUAGES"><strong>LANGUAGES AND SCRIPTS</strong></a>).
|
|
</p>
|
|
</dd>
|
|
<dt class="hdlist1">
|
|
<strong>--psm</strong> <em>N</em>
|
|
</dt>
|
|
<dd>
|
|
<p>
|
|
Set Tesseract to only run a subset of layout analysis and assume
|
|
a certain form of image. The options for <em>N</em> are:
|
|
</p>
|
|
<div class="literalblock">
|
|
<div class="content monospaced">
|
|
<pre>0 = Orientation and script detection (OSD) only.
|
|
1 = Automatic page segmentation with OSD.
|
|
2 = Automatic page segmentation, but no OSD, or OCR. (not implemented)
|
|
3 = Fully automatic page segmentation, but no OSD. (Default)
|
|
4 = Assume a single column of text of variable sizes.
|
|
5 = Assume a single uniform block of vertically aligned text.
|
|
6 = Assume a single uniform block of text.
|
|
7 = Treat the image as a single text line.
|
|
8 = Treat the image as a single word.
|
|
9 = Treat the image as a single word in a circle.
|
|
10 = Treat the image as a single character.
|
|
11 = Sparse text. Find as much text as possible in no particular order.
|
|
12 = Sparse text with OSD.
|
|
13 = Raw line. Treat the image as a single text line,
|
|
bypassing hacks that are Tesseract-specific.</pre>
|
|
</div></div>
|
|
</dd>
|
|
<dt class="hdlist1">
|
|
<strong>--oem</strong> <em>N</em>
|
|
</dt>
|
|
<dd>
|
|
<p>
|
|
Specify OCR Engine mode. The options for <em>N</em> are:
|
|
</p>
|
|
<div class="literalblock">
|
|
<div class="content monospaced">
|
|
<pre>0 = Original Tesseract only.
|
|
1 = Neural nets LSTM only.
|
|
2 = Tesseract + LSTM.
|
|
3 = Default, based on what is available.</pre>
|
|
</div></div>
|
|
</dd>
|
|
<dt class="hdlist1">
|
|
<strong>--tessdata-dir</strong> <em>PATH</em>
|
|
</dt>
|
|
<dd>
|
|
<p>
|
|
Specify the location of the tessdata directory.
|
|
</p>
|
|
</dd>
|
|
<dt class="hdlist1">
|
|
<strong>--user-patterns</strong> <em>FILE</em>
|
|
</dt>
|
|
<dd>
|
|
<p>
|
|
Specify the location of the user patterns file.
|
|
</p>
|
|
</dd>
|
|
<dt class="hdlist1">
|
|
<strong>--user-words</strong> <em>FILE</em>
|
|
</dt>
|
|
<dd>
|
|
<p>
|
|
Specify the location of the user words file.
|
|
</p>
|
|
</dd>
|
|
<dt class="hdlist1">
|
|
<em>CONFIGFILE</em>
|
|
</dt>
|
|
<dd>
|
|
<p>
|
|
The name of a config to use. The name can be a file in <span class="monospaced">tessdata/configs</span>
|
|
or <span class="monospaced">tessdata/tessconfigs</span>, or an absolute or relative file path.
|
|
A config is a plain text file which contains a list of parameters and
|
|
their values, one per line, with a space separating parameter from value.<br>
|
|
Interesting config files include:
|
|
</p>
|
|
<div class="ulist" id="CONFIGFILE"><ul>
|
|
<li>
|
|
<p>
|
|
<strong>alto</strong> — Output in ALTO format (<em>OUTPUTBASE</em><span class="monospaced">.xml</span>).
|
|
</p>
|
|
</li>
|
|
<li>
|
|
<p>
|
|
<strong>hocr</strong> — Output in hOCR format (<em>OUTPUTBASE</em><span class="monospaced">.hocr</span>).
|
|
</p>
|
|
</li>
|
|
<li>
|
|
<p>
|
|
<strong>page</strong> — Output in PAGE format (<em>OUTPUTBASE</em><span class="monospaced">.page.xml</span>).
|
|
The output can be customized with the flags:<br>
|
|
page_xml_polygon — Create polygons instead of bounding boxes (default: true)<br>
|
|
page_xml_level — Create the PAGE file on 0=linelevel or 1=wordlevel (default: 0)
|
|
</p>
|
|
</li>
|
|
<li>
|
|
<p>
|
|
<strong>pdf</strong> — Output PDF (<em>OUTPUTBASE</em><span class="monospaced">.pdf</span>).
|
|
</p>
|
|
</li>
|
|
<li>
|
|
<p>
|
|
<strong>tsv</strong> — Output TSV (<em>OUTPUTBASE</em><span class="monospaced">.tsv</span>).
|
|
</p>
|
|
</li>
|
|
<li>
|
|
<p>
|
|
<strong>txt</strong> — Output plain text (<em>OUTPUTBASE</em><span class="monospaced">.txt</span>).
|
|
</p>
|
|
</li>
|
|
<li>
|
|
<p>
|
|
<strong>get.images</strong> — Write processed input images to file (<em>OUTPUTBASE</em><span class="monospaced">.processedPAGENUMBER.tif</span>).
|
|
</p>
|
|
</li>
|
|
<li>
|
|
<p>
|
|
<strong>logfile</strong> — Redirect debug messages to file (<span class="monospaced">tesseract.log</span>).
|
|
</p>
|
|
</li>
|
|
<li>
|
|
<p>
|
|
<strong>lstm.train</strong> — Output files used by LSTM training (<em>OUTPUTBASE</em><span class="monospaced">.lstmf</span>).
|
|
</p>
|
|
</li>
|
|
<li>
|
|
<p>
|
|
<strong>makebox</strong> — Write box file (<em>OUTPUTBASE</em><span class="monospaced">.box</span>).
|
|
</p>
|
|
</li>
|
|
<li>
|
|
<p>
|
|
<strong>quiet</strong> — Redirect debug messages to <em>/dev/null</em>.
|
|
</p>
|
|
</li>
|
|
</ul></div>
|
|
</dd>
|
|
</dl></div>
|
|
<div class="paragraph"><p>It is possible to select several config files, for example
|
|
<span class="monospaced">tesseract image.png demo alto hocr pdf txt</span> will create four output files
|
|
<span class="monospaced">demo.xml</span>, <span class="monospaced">demo.hocr</span>, <span class="monospaced">demo.pdf</span> and <span class="monospaced">demo.txt</span> with the OCR results.</p></div>
|
|
<div class="paragraph"><p><strong>Nota bene:</strong> The options <strong>-l</strong> <em>LANG</em>, <strong>-l</strong> <em>SCRIPT</em> and <strong>--psm</strong> <em>N</em>
|
|
must occur before any <em>CONFIGFILE</em>.</p></div>
|
|
</div>
|
|
</div>
|
|
<div class="sect1">
|
|
<h2 id="_single_options">SINGLE OPTIONS</h2>
|
|
<div class="sectionbody">
|
|
<div class="dlist"><dl>
|
|
<dt class="hdlist1">
|
|
<strong>-h, --help</strong>
|
|
</dt>
|
|
<dd>
|
|
<p>
|
|
Show help message.
|
|
</p>
|
|
</dd>
|
|
<dt class="hdlist1">
|
|
<strong>--help-extra</strong>
|
|
</dt>
|
|
<dd>
|
|
<p>
|
|
Show extra help for advanced users.
|
|
</p>
|
|
</dd>
|
|
<dt class="hdlist1">
|
|
<strong>--help-psm</strong>
|
|
</dt>
|
|
<dd>
|
|
<p>
|
|
Show page segmentation modes.
|
|
</p>
|
|
</dd>
|
|
<dt class="hdlist1">
|
|
<strong>--help-oem</strong>
|
|
</dt>
|
|
<dd>
|
|
<p>
|
|
Show OCR Engine modes.
|
|
</p>
|
|
</dd>
|
|
<dt class="hdlist1">
|
|
<strong>-v, --version</strong>
|
|
</dt>
|
|
<dd>
|
|
<p>
|
|
Returns the current version of the tesseract(1) executable.
|
|
</p>
|
|
</dd>
|
|
<dt class="hdlist1">
|
|
<strong>--list-langs</strong>
|
|
</dt>
|
|
<dd>
|
|
<p>
|
|
List available languages for tesseract engine.
|
|
Can be used with <strong>--tessdata-dir</strong> <em>PATH</em>.
|
|
</p>
|
|
</dd>
|
|
<dt class="hdlist1">
|
|
<strong>--print-parameters</strong>
|
|
</dt>
|
|
<dd>
|
|
<p>
|
|
Print tesseract parameters.
|
|
</p>
|
|
</dd>
|
|
</dl></div>
|
|
</div>
|
|
</div>
|
|
<div class="sect1">
|
|
<h2 id="LANGUAGES">LANGUAGES AND SCRIPTS</h2>
|
|
<div class="sectionbody">
|
|
<div class="paragraph"><p>To recognize some text with Tesseract, it is normally necessary to specify
|
|
the language(s) or script(s) of the text (unless it is English text which is
|
|
supported by default) using <strong>-l</strong> <em>LANG</em> or <strong>-l</strong> <em>SCRIPT</em>.</p></div>
|
|
<div class="paragraph"><p>Selecting a language automatically also selects the language specific
|
|
character set and dictionary (word list).</p></div>
|
|
<div class="paragraph"><p>Selecting a script typically selects all characters of that script
|
|
which can be from different languages. The dictionary which is included
|
|
also contains a mix from different languages.
|
|
In most cases, a script also supports English.
|
|
So it is possible to recognize a language that has not been specifically
|
|
trained for by using traineddata for the script it is written in.</p></div>
|
|
<div class="paragraph"><p>More than one language or script may be specified by using <span class="monospaced">+</span>.
|
|
Example: <span class="monospaced">tesseract myimage.png myimage -l eng+deu+fra</span>.</p></div>
|
|
<div class="paragraph"><p><a href="https://github.com/tesseract-ocr/tessdata_fast">https://github.com/tesseract-ocr/tessdata_fast</a> provides fast language and
|
|
script models which are also part of Linux distributions.</p></div>
|
|
<div class="paragraph"><p>For Tesseract 4, <span class="monospaced">tessdata_fast</span> includes traineddata files for the
|
|
following languages:</p></div>
|
|
<div class="paragraph"><p><strong>afr</strong> (Afrikaans),
|
|
<strong>amh</strong> (Amharic),
|
|
<strong>ara</strong> (Arabic),
|
|
<strong>asm</strong> (Assamese),
|
|
<strong>aze</strong> (Azerbaijani),
|
|
<strong>aze_cyrl</strong> (Azerbaijani - Cyrillic),
|
|
<strong>bel</strong> (Belarusian),
|
|
<strong>ben</strong> (Bengali),
|
|
<strong>bod</strong> (Tibetan),
|
|
<strong>bos</strong> (Bosnian),
|
|
<strong>bre</strong> (Breton),
|
|
<strong>bul</strong> (Bulgarian),
|
|
<strong>cat</strong> (Catalan; Valencian),
|
|
<strong>ceb</strong> (Cebuano),
|
|
<strong>ces</strong> (Czech),
|
|
<strong>chi_sim</strong> (Chinese simplified),
|
|
<strong>chi_tra</strong> (Chinese traditional),
|
|
<strong>chr</strong> (Cherokee),
|
|
<strong>cos</strong> (Corsican),
|
|
<strong>cym</strong> (Welsh),
|
|
<strong>dan</strong> (Danish),
|
|
<strong>deu</strong> (German),
|
|
<strong>deu_latf</strong> (German Fraktur Latin),
|
|
<strong>div</strong> (Dhivehi),
|
|
<strong>dzo</strong> (Dzongkha),
|
|
<strong>ell</strong> (Greek, Modern, 1453-),
|
|
<strong>eng</strong> (English),
|
|
<strong>enm</strong> (English, Middle, 1100-1500),
|
|
<strong>epo</strong> (Esperanto),
|
|
<strong>equ</strong> (Math / equation detection module),
|
|
<strong>est</strong> (Estonian),
|
|
<strong>eus</strong> (Basque),
|
|
<strong>fas</strong> (Persian),
|
|
<strong>fao</strong> (Faroese),
|
|
<strong>fil</strong> (Filipino),
|
|
<strong>fin</strong> (Finnish),
|
|
<strong>fra</strong> (French),
|
|
<strong>frm</strong> (French, Middle, ca.1400-1600),
|
|
<strong>fry</strong> (West Frisian),
|
|
<strong>gla</strong> (Scottish Gaelic),
|
|
<strong>gle</strong> (Irish),
|
|
<strong>glg</strong> (Galician),
|
|
<strong>grc</strong> (Greek, Ancient, to 1453),
|
|
<strong>guj</strong> (Gujarati),
|
|
<strong>hat</strong> (Haitian; Haitian Creole),
|
|
<strong>heb</strong> (Hebrew),
|
|
<strong>hin</strong> (Hindi),
|
|
<strong>hrv</strong> (Croatian),
|
|
<strong>hun</strong> (Hungarian),
|
|
<strong>hye</strong> (Armenian),
|
|
<strong>iku</strong> (Inuktitut),
|
|
<strong>ind</strong> (Indonesian),
|
|
<strong>isl</strong> (Icelandic),
|
|
<strong>ita</strong> (Italian),
|
|
<strong>ita_old</strong> (Italian - Old),
|
|
<strong>jav</strong> (Javanese),
|
|
<strong>jpn</strong> (Japanese),
|
|
<strong>kan</strong> (Kannada),
|
|
<strong>kat</strong> (Georgian),
|
|
<strong>kat_old</strong> (Georgian - Old),
|
|
<strong>kaz</strong> (Kazakh),
|
|
<strong>khm</strong> (Central Khmer),
|
|
<strong>kir</strong> (Kirghiz; Kyrgyz),
|
|
<strong>kmr</strong> (Kurdish Kurmanji),
|
|
<strong>kor</strong> (Korean),
|
|
<strong>kor_vert</strong> (Korean vertical),
|
|
<strong>lao</strong> (Lao),
|
|
<strong>lat</strong> (Latin),
|
|
<strong>lav</strong> (Latvian),
|
|
<strong>lit</strong> (Lithuanian),
|
|
<strong>ltz</strong> (Luxembourgish),
|
|
<strong>mal</strong> (Malayalam),
|
|
<strong>mar</strong> (Marathi),
|
|
<strong>mkd</strong> (Macedonian),
|
|
<strong>mlt</strong> (Maltese),
|
|
<strong>mon</strong> (Mongolian),
|
|
<strong>mri</strong> (Maori),
|
|
<strong>msa</strong> (Malay),
|
|
<strong>mya</strong> (Burmese),
|
|
<strong>nep</strong> (Nepali),
|
|
<strong>nld</strong> (Dutch; Flemish),
|
|
<strong>nor</strong> (Norwegian),
|
|
<strong>oci</strong> (Occitan post 1500),
|
|
<strong>ori</strong> (Oriya),
|
|
<strong>osd</strong> (Orientation and script detection module),
|
|
<strong>pan</strong> (Panjabi; Punjabi),
|
|
<strong>pol</strong> (Polish),
|
|
<strong>por</strong> (Portuguese),
|
|
<strong>pus</strong> (Pushto; Pashto),
|
|
<strong>que</strong> (Quechua),
|
|
<strong>ron</strong> (Romanian; Moldavian; Moldovan),
|
|
<strong>rus</strong> (Russian),
|
|
<strong>san</strong> (Sanskrit),
|
|
<strong>sin</strong> (Sinhala; Sinhalese),
|
|
<strong>slk</strong> (Slovak),
|
|
<strong>slv</strong> (Slovenian),
|
|
<strong>snd</strong> (Sindhi),
|
|
<strong>spa</strong> (Spanish; Castilian),
|
|
<strong>spa_old</strong> (Spanish; Castilian - Old),
|
|
<strong>sqi</strong> (Albanian),
|
|
<strong>srp</strong> (Serbian),
|
|
<strong>srp_latn</strong> (Serbian - Latin),
|
|
<strong>sun</strong> (Sundanese),
|
|
<strong>swa</strong> (Swahili),
|
|
<strong>swe</strong> (Swedish),
|
|
<strong>syr</strong> (Syriac),
|
|
<strong>tam</strong> (Tamil),
|
|
<strong>tat</strong> (Tatar),
|
|
<strong>tel</strong> (Telugu),
|
|
<strong>tgk</strong> (Tajik),
|
|
<strong>tha</strong> (Thai),
|
|
<strong>tir</strong> (Tigrinya),
|
|
<strong>ton</strong> (Tonga),
|
|
<strong>tur</strong> (Turkish),
|
|
<strong>uig</strong> (Uighur; Uyghur),
|
|
<strong>ukr</strong> (Ukrainian),
|
|
<strong>urd</strong> (Urdu),
|
|
<strong>uzb</strong> (Uzbek),
|
|
<strong>uzb_cyrl</strong> (Uzbek - Cyrillic),
|
|
<strong>vie</strong> (Vietnamese),
|
|
<strong>yid</strong> (Yiddish),
|
|
<strong>yor</strong> (Yoruba)</p></div>
|
|
<div class="paragraph"><p>To use a non-standard language pack named <span class="monospaced">foo.traineddata</span>, set the
|
|
<span class="monospaced">TESSDATA_PREFIX</span> environment variable so the file can be found at
|
|
<span class="monospaced">TESSDATA_PREFIX/tessdata/foo.traineddata</span> and give Tesseract the
|
|
argument <strong>-l</strong> <span class="monospaced">foo</span>.</p></div>
|
|
<div class="paragraph"><p>For Tesseract 4, <span class="monospaced">tessdata_fast</span> includes traineddata files for the
|
|
following scripts:</p></div>
|
|
<div class="paragraph"><p><strong>Arabic</strong>,
|
|
<strong>Armenian</strong>,
|
|
<strong>Bengali</strong>,
|
|
<strong>Canadian_Aboriginal</strong>,
|
|
<strong>Cherokee</strong>,
|
|
<strong>Cyrillic</strong>,
|
|
<strong>Devanagari</strong>,
|
|
<strong>Ethiopic</strong>,
|
|
<strong>Fraktur</strong>,
|
|
<strong>Georgian</strong>,
|
|
<strong>Greek</strong>,
|
|
<strong>Gujarati</strong>,
|
|
<strong>Gurmukhi</strong>,
|
|
<strong>HanS</strong> (Han simplified),
|
|
<strong>HanS_vert</strong> (Han simplified, vertical),
|
|
<strong>HanT</strong> (Han traditional),
|
|
<strong>HanT_vert</strong> (Han traditional, vertical),
|
|
<strong>Hangul</strong>,
|
|
<strong>Hangul_vert</strong> (Hangul vertical),
|
|
<strong>Hebrew</strong>,
|
|
<strong>Japanese</strong>,
|
|
<strong>Japanese_vert</strong> (Japanese vertical),
|
|
<strong>Kannada</strong>,
|
|
<strong>Khmer</strong>,
|
|
<strong>Lao</strong>,
|
|
<strong>Latin</strong>,
|
|
<strong>Malayalam</strong>,
|
|
<strong>Myanmar</strong>,
|
|
<strong>Oriya</strong> (Odia),
|
|
<strong>Sinhala</strong>,
|
|
<strong>Syriac</strong>,
|
|
<strong>Tamil</strong>,
|
|
<strong>Telugu</strong>,
|
|
<strong>Thaana</strong>,
|
|
<strong>Thai</strong>,
|
|
<strong>Tibetan</strong>,
|
|
<strong>Vietnamese</strong>.</p></div>
|
|
<div class="paragraph"><p>The same languages and scripts are available from
|
|
<a href="https://github.com/tesseract-ocr/tessdata_best">https://github.com/tesseract-ocr/tessdata_best</a>.
|
|
<span class="monospaced">tessdata_best</span> provides slow language and script models.
|
|
These models are needed for training. They can also give better OCR results,
|
|
but the recognition takes much more time.</p></div>
|
|
<div class="paragraph"><p>Both <span class="monospaced">tessdata_fast</span> and <span class="monospaced">tessdata_best</span> only support the LSTM OCR engine.</p></div>
|
|
<div class="paragraph"><p>There is a third repository, <a href="https://github.com/tesseract-ocr/tessdata">https://github.com/tesseract-ocr/tessdata</a>,
|
|
with models which support both the Tesseract 3 legacy OCR engine and the
|
|
Tesseract 4 LSTM OCR engine.</p></div>
|
|
</div>
|
|
</div>
|
|
<div class="sect1">
|
|
<h2 id="_config_files_and_augmenting_with_user_data">CONFIG FILES AND AUGMENTING WITH USER DATA</h2>
|
|
<div class="sectionbody">
|
|
<div class="paragraph"><p>Tesseract config files consist of lines with parameter-value pairs (space
|
|
separated). The parameters are documented as flags in the source code like
|
|
the following one in tesseractclass.h:</p></div>
|
|
<div class="paragraph"><p><span class="monospaced">STRING_VAR_H(tessedit_char_blacklist, "",
|
|
"Blacklist of chars not to recognize");</span></p></div>
|
|
<div class="paragraph"><p>These parameters may enable or disable various features of the engine, and
|
|
may cause it to load (or not load) various data. For instance, let’s suppose
|
|
you want to OCR in English, but suppress the normal dictionary and load an
|
|
alternative word list and an alternative list of patterns — these two files
|
|
are the most commonly used extra data files.</p></div>
|
|
<div class="paragraph"><p>If your language pack is in <em>/path/to/eng.traineddata</em> and the hocr config
|
|
is in <em>/path/to/configs/hocr</em> then create three new files:</p></div>
|
|
<div class="paragraph"><p><em>/path/to/eng.user-words</em>:</p></div>
|
|
<div class="verseblock">
|
|
<pre class="content">the
|
|
quick
|
|
brown
|
|
fox
|
|
jumped</pre>
|
|
<div class="attribution">
|
|
</div></div>
|
|
<div class="paragraph"><p><em>/path/to/eng.user-patterns</em>:</p></div>
|
|
<div class="verseblock">
|
|
<pre class="content">1-\d\d\d-GOOG-411
|
|
www.\n\\\*.com</pre>
|
|
<div class="attribution">
|
|
</div></div>
|
|
<div class="paragraph"><p><em>/path/to/configs/bazaar</em>:</p></div>
|
|
<div class="verseblock">
|
|
<pre class="content">load_system_dawg F
|
|
load_freq_dawg F
|
|
user_words_suffix user-words
|
|
user_patterns_suffix user-patterns</pre>
|
|
<div class="attribution">
|
|
</div></div>
|
|
<div class="paragraph"><p>Now, if you pass the word <em>bazaar</em> as a <a href="#CONFIGFILE"><em>CONFIGFILE</em></a> to
|
|
Tesseract, Tesseract will not bother loading the system dictionary nor
|
|
the dictionary of frequent words and will load and use the <em>eng.user-words</em>
|
|
and <em>eng.user-patterns</em> files you provided. The former is a simple word list,
|
|
one per line. The format of the latter is documented in <em>dict/trie.h</em>
|
|
on <em>read_pattern_list()</em>.</p></div>
|
|
</div>
|
|
</div>
|
|
<div class="sect1">
|
|
<h2 id="_environment_variables">ENVIRONMENT VARIABLES</h2>
|
|
<div class="sectionbody">
|
|
<div class="dlist"><dl>
|
|
<dt class="hdlist1">
|
|
<strong><span class="monospaced">TESSDATA_PREFIX</span></strong>
|
|
</dt>
|
|
<dd>
|
|
<p>
|
|
If the <span class="monospaced">TESSDATA_PREFIX</span> is set to a path, then that path is used to
|
|
find the <span class="monospaced">tessdata</span> directory with language and script recognition
|
|
models and config files.
|
|
Using <a href="#TESSDATADIR"><strong>--tessdata-dir</strong> <em>PATH</em></a> is the recommended alternative.
|
|
</p>
|
|
</dd>
|
|
<dt class="hdlist1">
|
|
<strong><span class="monospaced">OMP_THREAD_LIMIT</span></strong>
|
|
</dt>
|
|
<dd>
|
|
<p>
|
|
If the <span class="monospaced">tesseract</span> executable was built with multithreading support,
|
|
it will normally use four CPU cores for the OCR process. While this
|
|
can be faster for a single image, it gives bad performance if the host
|
|
computer provides fewer than four CPU cores or if OCR is run on many images.
|
|
Only a single CPU core is used with <span class="monospaced">OMP_THREAD_LIMIT=1</span>.
|
|
</p>
|
|
</dd>
|
|
</dl></div>
|
|
</div>
|
|
</div>
|
|
<div class="sect1">
|
|
<h2 id="_history">HISTORY</h2>
|
|
<div class="sectionbody">
|
|
<div class="paragraph"><p>The engine was developed at Hewlett Packard Laboratories Bristol and at
|
|
Hewlett Packard Co, Greeley Colorado between 1985 and 1994, with some more
|
|
changes made in 1996 to port to Windows, and some C++izing in 1998. A
|
|
lot of the code was written in C, and then some more was written in C++.
|
|
The C++ code makes heavy use of a list system using macros. This predates
|
|
STL, was portable before STL, and is more efficient than STL lists, but has
|
|
the big negative that if you do get a segmentation violation, it is hard to
|
|
debug.</p></div>
|
|
<div class="paragraph"><p>Version 2.00 brought Unicode (UTF-8) support, six languages, and the ability
|
|
to train Tesseract.</p></div>
|
|
<div class="paragraph"><p>Tesseract was included in UNLV’s Fourth Annual Test of OCR Accuracy.
|
|
See <a href="https://github.com/tesseract-ocr/docs/blob/main/AT-1995.pdf">https://github.com/tesseract-ocr/docs/blob/main/AT-1995.pdf</a>.
|
|
Since Tesseract 2.00,
|
|
scripts have been included to allow anyone to reproduce some of these tests.
|
|
See <a href="https://tesseract-ocr.github.io/tessdoc/TestingTesseract.html">https://tesseract-ocr.github.io/tessdoc/TestingTesseract.html</a> for more
|
|
details.</p></div>
|
|
<div class="paragraph"><p>Tesseract 3.00 added a number of new languages, including Chinese, Japanese,
|
|
and Korean. It also introduced a new, single-file based system of managing
|
|
language data.</p></div>
|
|
<div class="paragraph"><p>Tesseract 3.02 added BiDirectional text support, the ability to recognize
|
|
multiple languages in a single image, and improved layout analysis.</p></div>
|
|
<div class="paragraph"><p>Tesseract 4 adds a new neural net (LSTM) based OCR engine which is focused
|
|
on line recognition, but also still supports the legacy Tesseract OCR engine of
|
|
Tesseract 3 which works by recognizing character patterns. Compatibility with
|
|
Tesseract 3 is enabled by <span class="monospaced">--oem 0</span>. This also needs traineddata files which
|
|
support the legacy engine, for example those from the tessdata repository
|
|
(<a href="https://github.com/tesseract-ocr/tessdata">https://github.com/tesseract-ocr/tessdata</a>).</p></div>
|
|
<div class="paragraph"><p>For further details, see the release notes in the Tesseract documentation
|
|
(<a href="https://tesseract-ocr.github.io/tessdoc/ReleaseNotes.html">https://tesseract-ocr.github.io/tessdoc/ReleaseNotes.html</a>).</p></div>
|
|
</div>
|
|
</div>
|
|
<div class="sect1">
|
|
<h2 id="_resources">RESOURCES</h2>
|
|
<div class="sectionbody">
|
|
<div class="paragraph"><p>Main web site: <a href="https://github.com/tesseract-ocr">https://github.com/tesseract-ocr</a><br>
|
|
User forum: <a href="https://groups.google.com/g/tesseract-ocr">https://groups.google.com/g/tesseract-ocr</a><br>
|
|
Documentation: <a href="https://tesseract-ocr.github.io/">https://tesseract-ocr.github.io/</a><br>
|
|
Information on training: <a href="https://tesseract-ocr.github.io/tessdoc/Training-Tesseract.html">https://tesseract-ocr.github.io/tessdoc/Training-Tesseract.html</a></p></div>
|
|
</div>
|
|
</div>
|
|
<div class="sect1">
|
|
<h2 id="_see_also">SEE ALSO</h2>
|
|
<div class="sectionbody">
|
|
<div class="paragraph"><p>ambiguous_words(1), cntraining(1), combine_tessdata(1), dawg2wordlist(1),
|
|
shape_training(1), mftraining(1), unicharambigs(5), unicharset(5),
|
|
unicharset_extractor(1), wordlist2dawg(1)</p></div>
|
|
</div>
|
|
</div>
|
|
<div class="sect1">
|
|
<h2 id="_author">AUTHOR</h2>
|
|
<div class="sectionbody">
|
|
<div class="paragraph"><p>Tesseract development was led at Hewlett-Packard and Google by Ray Smith.
|
|
The development team has included:</p></div>
|
|
<div class="paragraph"><p>Ahmad Abdulkader, Chris Newton, Dan Johnson, Dar-Shyang Lee, David Eger,
|
|
Eric Wiseblatt, Faisal Shafait, Hiroshi Takenaka, Joe Liu, Joern Wanke,
|
|
Mark Seaman, Mickey Namiki, Nicholas Beato, Oded Fuhrmann, Phil Cheatle,
|
|
Pingping Xiu, Pong Eksombatchai (Chantat), Ranjith Unnikrishnan, Raquel
|
|
Romano, Ray Smith, Rika Antonova, Robert Moss, Samuel Charron, Sheelagh
|
|
Lloyd, Shobhit Saxena, and Thomas Kielbus.</p></div>
|
|
<div class="paragraph"><p>For a list of contributors see
|
|
<a href="https://github.com/tesseract-ocr/tesseract/blob/main/AUTHORS">https://github.com/tesseract-ocr/tesseract/blob/main/AUTHORS</a>.</p></div>
|
|
</div>
|
|
</div>
|
|
<div class="sect1">
|
|
<h2 id="_copying">COPYING</h2>
|
|
<div class="sectionbody">
|
|
<div class="paragraph"><p>Licensed under the Apache License, Version 2.0</p></div>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
<div id="footnotes"><hr></div>
|
|
<div id="footer">
|
|
<div id="footer-text">
|
|
Last updated
|
|
2024-11-10 20:33:28 CET
|
|
</div>
|
|
</div>
|
|
</body>
|
|
</html>
|