Overhauled Deployment Structure
This commit is contained in:
965
Data/Server/Python_API_Endpoints/Tesseract-OCR/unicharset.5.html
Normal file
965
Data/Server/Python_API_Endpoints/Tesseract-OCR/unicharset.5.html
Normal file
@ -0,0 +1,965 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
|
||||
<meta name="generator" content="AsciiDoc 10.2.0">
|
||||
<title>UNICHARSET(5)</title>
|
||||
<style type="text/css">
|
||||
/* Shared CSS for AsciiDoc xhtml11 and html5 backends */
|
||||
|
||||
/* Default font. */
|
||||
body {
|
||||
font-family: Georgia,serif;
|
||||
}
|
||||
|
||||
/* Title font. */
|
||||
h1, h2, h3, h4, h5, h6,
|
||||
div.title, caption.title,
|
||||
thead, p.table.header,
|
||||
#toctitle,
|
||||
#author, #revnumber, #revdate, #revremark,
|
||||
#footer {
|
||||
font-family: Arial,Helvetica,sans-serif;
|
||||
}
|
||||
|
||||
body {
|
||||
margin: 1em 5% 1em 5%;
|
||||
}
|
||||
|
||||
a {
|
||||
color: blue;
|
||||
text-decoration: underline;
|
||||
}
|
||||
a:visited {
|
||||
color: fuchsia;
|
||||
}
|
||||
|
||||
em {
|
||||
font-style: italic;
|
||||
color: navy;
|
||||
}
|
||||
|
||||
strong {
|
||||
font-weight: bold;
|
||||
color: #083194;
|
||||
}
|
||||
|
||||
h1, h2, h3, h4, h5, h6 {
|
||||
color: #527bbd;
|
||||
margin-top: 1.2em;
|
||||
margin-bottom: 0.5em;
|
||||
line-height: 1.3;
|
||||
}
|
||||
|
||||
h1, h2, h3 {
|
||||
border-bottom: 2px solid silver;
|
||||
}
|
||||
h2 {
|
||||
padding-top: 0.5em;
|
||||
}
|
||||
h3 {
|
||||
float: left;
|
||||
}
|
||||
h3 + * {
|
||||
clear: left;
|
||||
}
|
||||
h5 {
|
||||
font-size: 1.0em;
|
||||
}
|
||||
|
||||
div.sectionbody {
|
||||
margin-left: 0;
|
||||
}
|
||||
|
||||
hr {
|
||||
border: 1px solid silver;
|
||||
}
|
||||
|
||||
p {
|
||||
margin-top: 0.5em;
|
||||
margin-bottom: 0.5em;
|
||||
}
|
||||
|
||||
ul, ol, li > p {
|
||||
margin-top: 0;
|
||||
}
|
||||
ul > li { color: #aaa; }
|
||||
ul > li > * { color: black; }
|
||||
|
||||
.monospaced, code, pre {
|
||||
font-family: "Courier New", Courier, monospace;
|
||||
font-size: inherit;
|
||||
color: navy;
|
||||
padding: 0;
|
||||
margin: 0;
|
||||
}
|
||||
pre {
|
||||
white-space: pre-wrap;
|
||||
}
|
||||
|
||||
#author {
|
||||
color: #527bbd;
|
||||
font-weight: bold;
|
||||
font-size: 1.1em;
|
||||
}
|
||||
#email {
|
||||
}
|
||||
#revnumber, #revdate, #revremark {
|
||||
}
|
||||
|
||||
#footer {
|
||||
font-size: small;
|
||||
border-top: 2px solid silver;
|
||||
padding-top: 0.5em;
|
||||
margin-top: 4.0em;
|
||||
}
|
||||
#footer-text {
|
||||
float: left;
|
||||
padding-bottom: 0.5em;
|
||||
}
|
||||
#footer-badges {
|
||||
float: right;
|
||||
padding-bottom: 0.5em;
|
||||
}
|
||||
|
||||
#preamble {
|
||||
margin-top: 1.5em;
|
||||
margin-bottom: 1.5em;
|
||||
}
|
||||
div.imageblock, div.exampleblock, div.verseblock,
|
||||
div.quoteblock, div.literalblock, div.listingblock, div.sidebarblock,
|
||||
div.admonitionblock {
|
||||
margin-top: 1.0em;
|
||||
margin-bottom: 1.5em;
|
||||
}
|
||||
div.admonitionblock {
|
||||
margin-top: 2.0em;
|
||||
margin-bottom: 2.0em;
|
||||
margin-right: 10%;
|
||||
color: #606060;
|
||||
}
|
||||
|
||||
div.content { /* Block element content. */
|
||||
padding: 0;
|
||||
}
|
||||
|
||||
/* Block element titles. */
|
||||
div.title, caption.title {
|
||||
color: #527bbd;
|
||||
font-weight: bold;
|
||||
text-align: left;
|
||||
margin-top: 1.0em;
|
||||
margin-bottom: 0.5em;
|
||||
}
|
||||
div.title + * {
|
||||
margin-top: 0;
|
||||
}
|
||||
|
||||
td div.title:first-child {
|
||||
margin-top: 0.0em;
|
||||
}
|
||||
div.content div.title:first-child {
|
||||
margin-top: 0.0em;
|
||||
}
|
||||
div.content + div.title {
|
||||
margin-top: 0.0em;
|
||||
}
|
||||
|
||||
div.sidebarblock > div.content {
|
||||
background: #ffffee;
|
||||
border: 1px solid #dddddd;
|
||||
border-left: 4px solid #f0f0f0;
|
||||
padding: 0.5em;
|
||||
}
|
||||
|
||||
div.listingblock > div.content {
|
||||
border: 1px solid #dddddd;
|
||||
border-left: 5px solid #f0f0f0;
|
||||
background: #f8f8f8;
|
||||
padding: 0.5em;
|
||||
}
|
||||
|
||||
div.quoteblock, div.verseblock {
|
||||
padding-left: 1.0em;
|
||||
margin-left: 1.0em;
|
||||
margin-right: 10%;
|
||||
border-left: 5px solid #f0f0f0;
|
||||
color: #888;
|
||||
}
|
||||
|
||||
div.quoteblock > div.attribution {
|
||||
padding-top: 0.5em;
|
||||
text-align: right;
|
||||
}
|
||||
|
||||
div.verseblock > pre.content {
|
||||
font-family: inherit;
|
||||
font-size: inherit;
|
||||
}
|
||||
div.verseblock > div.attribution {
|
||||
padding-top: 0.75em;
|
||||
text-align: left;
|
||||
}
|
||||
/* DEPRECATED: Pre version 8.2.7 verse style literal block. */
|
||||
div.verseblock + div.attribution {
|
||||
text-align: left;
|
||||
}
|
||||
|
||||
div.admonitionblock .icon {
|
||||
vertical-align: top;
|
||||
font-size: 1.1em;
|
||||
font-weight: bold;
|
||||
text-decoration: underline;
|
||||
color: #527bbd;
|
||||
padding-right: 0.5em;
|
||||
}
|
||||
div.admonitionblock td.content {
|
||||
padding-left: 0.5em;
|
||||
border-left: 3px solid #dddddd;
|
||||
}
|
||||
|
||||
div.exampleblock > div.content {
|
||||
border-left: 3px solid #dddddd;
|
||||
padding-left: 0.5em;
|
||||
}
|
||||
|
||||
div.imageblock div.content { padding-left: 0; }
|
||||
span.image img { border-style: none; vertical-align: text-bottom; }
|
||||
a.image:visited { color: white; }
|
||||
|
||||
dl {
|
||||
margin-top: 0.8em;
|
||||
margin-bottom: 0.8em;
|
||||
}
|
||||
dt {
|
||||
margin-top: 0.5em;
|
||||
margin-bottom: 0;
|
||||
font-style: normal;
|
||||
color: navy;
|
||||
}
|
||||
dd > *:first-child {
|
||||
margin-top: 0.1em;
|
||||
}
|
||||
|
||||
ul, ol {
|
||||
list-style-position: outside;
|
||||
}
|
||||
ol.arabic {
|
||||
list-style-type: decimal;
|
||||
}
|
||||
ol.loweralpha {
|
||||
list-style-type: lower-alpha;
|
||||
}
|
||||
ol.upperalpha {
|
||||
list-style-type: upper-alpha;
|
||||
}
|
||||
ol.lowerroman {
|
||||
list-style-type: lower-roman;
|
||||
}
|
||||
ol.upperroman {
|
||||
list-style-type: upper-roman;
|
||||
}
|
||||
|
||||
div.compact ul, div.compact ol,
|
||||
div.compact p, div.compact p,
|
||||
div.compact div, div.compact div {
|
||||
margin-top: 0.1em;
|
||||
margin-bottom: 0.1em;
|
||||
}
|
||||
|
||||
tfoot {
|
||||
font-weight: bold;
|
||||
}
|
||||
td > div.verse {
|
||||
white-space: pre;
|
||||
}
|
||||
|
||||
div.hdlist {
|
||||
margin-top: 0.8em;
|
||||
margin-bottom: 0.8em;
|
||||
}
|
||||
div.hdlist tr {
|
||||
padding-bottom: 15px;
|
||||
}
|
||||
dt.hdlist1.strong, td.hdlist1.strong {
|
||||
font-weight: bold;
|
||||
}
|
||||
td.hdlist1 {
|
||||
vertical-align: top;
|
||||
font-style: normal;
|
||||
padding-right: 0.8em;
|
||||
color: navy;
|
||||
}
|
||||
td.hdlist2 {
|
||||
vertical-align: top;
|
||||
}
|
||||
div.hdlist.compact tr {
|
||||
margin: 0;
|
||||
padding-bottom: 0;
|
||||
}
|
||||
|
||||
.comment {
|
||||
background: yellow;
|
||||
}
|
||||
|
||||
.footnote, .footnoteref {
|
||||
font-size: 0.8em;
|
||||
}
|
||||
|
||||
span.footnote, span.footnoteref {
|
||||
vertical-align: super;
|
||||
}
|
||||
|
||||
#footnotes {
|
||||
margin: 20px 0 20px 0;
|
||||
padding: 7px 0 0 0;
|
||||
}
|
||||
|
||||
#footnotes div.footnote {
|
||||
margin: 0 0 5px 0;
|
||||
}
|
||||
|
||||
#footnotes hr {
|
||||
border: none;
|
||||
border-top: 1px solid silver;
|
||||
height: 1px;
|
||||
text-align: left;
|
||||
margin-left: 0;
|
||||
width: 20%;
|
||||
min-width: 100px;
|
||||
}
|
||||
|
||||
div.colist td {
|
||||
padding-right: 0.5em;
|
||||
padding-bottom: 0.3em;
|
||||
vertical-align: top;
|
||||
}
|
||||
div.colist td img {
|
||||
margin-top: 0.3em;
|
||||
}
|
||||
|
||||
@media print {
|
||||
#footer-badges { display: none; }
|
||||
}
|
||||
|
||||
#toc {
|
||||
margin-bottom: 2.5em;
|
||||
}
|
||||
|
||||
#toctitle {
|
||||
color: #527bbd;
|
||||
font-size: 1.1em;
|
||||
font-weight: bold;
|
||||
margin-top: 1.0em;
|
||||
margin-bottom: 0.1em;
|
||||
}
|
||||
|
||||
div.toclevel0, div.toclevel1, div.toclevel2, div.toclevel3, div.toclevel4 {
|
||||
margin-top: 0;
|
||||
margin-bottom: 0;
|
||||
}
|
||||
div.toclevel2 {
|
||||
margin-left: 2em;
|
||||
font-size: 0.9em;
|
||||
}
|
||||
div.toclevel3 {
|
||||
margin-left: 4em;
|
||||
font-size: 0.9em;
|
||||
}
|
||||
div.toclevel4 {
|
||||
margin-left: 6em;
|
||||
font-size: 0.9em;
|
||||
}
|
||||
|
||||
span.aqua { color: aqua; }
|
||||
span.black { color: black; }
|
||||
span.blue { color: blue; }
|
||||
span.fuchsia { color: fuchsia; }
|
||||
span.gray { color: gray; }
|
||||
span.green { color: green; }
|
||||
span.lime { color: lime; }
|
||||
span.maroon { color: maroon; }
|
||||
span.navy { color: navy; }
|
||||
span.olive { color: olive; }
|
||||
span.purple { color: purple; }
|
||||
span.red { color: red; }
|
||||
span.silver { color: silver; }
|
||||
span.teal { color: teal; }
|
||||
span.white { color: white; }
|
||||
span.yellow { color: yellow; }
|
||||
|
||||
span.aqua-background { background: aqua; }
|
||||
span.black-background { background: black; }
|
||||
span.blue-background { background: blue; }
|
||||
span.fuchsia-background { background: fuchsia; }
|
||||
span.gray-background { background: gray; }
|
||||
span.green-background { background: green; }
|
||||
span.lime-background { background: lime; }
|
||||
span.maroon-background { background: maroon; }
|
||||
span.navy-background { background: navy; }
|
||||
span.olive-background { background: olive; }
|
||||
span.purple-background { background: purple; }
|
||||
span.red-background { background: red; }
|
||||
span.silver-background { background: silver; }
|
||||
span.teal-background { background: teal; }
|
||||
span.white-background { background: white; }
|
||||
span.yellow-background { background: yellow; }
|
||||
|
||||
span.big { font-size: 2em; }
|
||||
span.small { font-size: 0.6em; }
|
||||
|
||||
span.underline { text-decoration: underline; }
|
||||
span.overline { text-decoration: overline; }
|
||||
span.line-through { text-decoration: line-through; }
|
||||
|
||||
div.unbreakable { page-break-inside: avoid; }
|
||||
|
||||
|
||||
/*
|
||||
* xhtml11 specific
|
||||
*
|
||||
* */
|
||||
|
||||
div.tableblock {
|
||||
margin-top: 1.0em;
|
||||
margin-bottom: 1.5em;
|
||||
}
|
||||
div.tableblock > table {
|
||||
border: 3px solid #527bbd;
|
||||
}
|
||||
thead, p.table.header {
|
||||
font-weight: bold;
|
||||
color: #527bbd;
|
||||
}
|
||||
p.table {
|
||||
margin-top: 0;
|
||||
}
|
||||
/* Because the table frame attribute is overridden by CSS in most browsers. */
|
||||
div.tableblock > table[frame="void"] {
|
||||
border-style: none;
|
||||
}
|
||||
div.tableblock > table[frame="hsides"] {
|
||||
border-left-style: none;
|
||||
border-right-style: none;
|
||||
}
|
||||
div.tableblock > table[frame="vsides"] {
|
||||
border-top-style: none;
|
||||
border-bottom-style: none;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* html5 specific
|
||||
*
|
||||
* */
|
||||
|
||||
table.tableblock {
|
||||
margin-top: 1.0em;
|
||||
margin-bottom: 1.5em;
|
||||
}
|
||||
thead, p.tableblock.header {
|
||||
font-weight: bold;
|
||||
color: #527bbd;
|
||||
}
|
||||
p.tableblock {
|
||||
margin-top: 0;
|
||||
}
|
||||
table.tableblock {
|
||||
border-width: 3px;
|
||||
border-spacing: 0px;
|
||||
border-style: solid;
|
||||
border-color: #527bbd;
|
||||
border-collapse: collapse;
|
||||
}
|
||||
th.tableblock, td.tableblock {
|
||||
border-width: 1px;
|
||||
padding: 4px;
|
||||
border-style: solid;
|
||||
border-color: #527bbd;
|
||||
}
|
||||
|
||||
table.tableblock.frame-topbot {
|
||||
border-left-style: hidden;
|
||||
border-right-style: hidden;
|
||||
}
|
||||
table.tableblock.frame-sides {
|
||||
border-top-style: hidden;
|
||||
border-bottom-style: hidden;
|
||||
}
|
||||
table.tableblock.frame-none {
|
||||
border-style: hidden;
|
||||
}
|
||||
|
||||
th.tableblock.halign-left, td.tableblock.halign-left {
|
||||
text-align: left;
|
||||
}
|
||||
th.tableblock.halign-center, td.tableblock.halign-center {
|
||||
text-align: center;
|
||||
}
|
||||
th.tableblock.halign-right, td.tableblock.halign-right {
|
||||
text-align: right;
|
||||
}
|
||||
|
||||
th.tableblock.valign-top, td.tableblock.valign-top {
|
||||
vertical-align: top;
|
||||
}
|
||||
th.tableblock.valign-middle, td.tableblock.valign-middle {
|
||||
vertical-align: middle;
|
||||
}
|
||||
th.tableblock.valign-bottom, td.tableblock.valign-bottom {
|
||||
vertical-align: bottom;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* manpage specific
|
||||
*
|
||||
* */
|
||||
|
||||
body.manpage h1 {
|
||||
padding-top: 0.5em;
|
||||
padding-bottom: 0.5em;
|
||||
border-top: 2px solid silver;
|
||||
border-bottom: 2px solid silver;
|
||||
}
|
||||
body.manpage h2 {
|
||||
border-style: none;
|
||||
}
|
||||
body.manpage div.sectionbody {
|
||||
margin-left: 3em;
|
||||
}
|
||||
|
||||
@media print {
|
||||
body.manpage div#toc { display: none; }
|
||||
}
|
||||
|
||||
|
||||
</style>
|
||||
<script type="text/javascript">
|
||||
/*<+'])');
|
||||
// Function that scans the DOM tree for header elements (the DOM2
|
||||
// nodeIterator API would be a better technique but not supported by all
|
||||
// browsers).
|
||||
var iterate = function (el) {
|
||||
for (var i = el.firstChild; i != null; i = i.nextSibling) {
|
||||
if (i.nodeType == 1 /* Node.ELEMENT_NODE */) {
|
||||
var mo = re.exec(i.tagName);
|
||||
if (mo && (i.getAttribute("class") || i.getAttribute("className")) != "float") {
|
||||
result[result.length] = new TocEntry(i, getText(i), mo[1]-1);
|
||||
}
|
||||
iterate(i);
|
||||
}
|
||||
}
|
||||
}
|
||||
iterate(el);
|
||||
return result;
|
||||
}
|
||||
|
||||
var toc = document.getElementById("toc");
|
||||
if (!toc) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Delete existing TOC entries in case we're reloading the TOC.
|
||||
var tocEntriesToRemove = [];
|
||||
var i;
|
||||
for (i = 0; i < toc.childNodes.length; i++) {
|
||||
var entry = toc.childNodes[i];
|
||||
if (entry.nodeName.toLowerCase() == 'div'
|
||||
&& entry.getAttribute("class")
|
||||
&& entry.getAttribute("class").match(/^toclevel/))
|
||||
tocEntriesToRemove.push(entry);
|
||||
}
|
||||
for (i = 0; i < tocEntriesToRemove.length; i++) {
|
||||
toc.removeChild(tocEntriesToRemove[i]);
|
||||
}
|
||||
|
||||
// Rebuild TOC entries.
|
||||
var entries = tocEntries(document.getElementById("content"), toclevels);
|
||||
for (var i = 0; i < entries.length; ++i) {
|
||||
var entry = entries[i];
|
||||
if (entry.element.id == "")
|
||||
entry.element.id = "_toc_" + i;
|
||||
var a = document.createElement("a");
|
||||
a.href = "#" + entry.element.id;
|
||||
a.appendChild(document.createTextNode(entry.text));
|
||||
var div = document.createElement("div");
|
||||
div.appendChild(a);
|
||||
div.className = "toclevel" + entry.toclevel;
|
||||
toc.appendChild(div);
|
||||
}
|
||||
if (entries.length == 0)
|
||||
toc.parentNode.removeChild(toc);
|
||||
},
|
||||
|
||||
|
||||
/////////////////////////////////////////////////////////////////////
|
||||
// Footnotes generator
|
||||
/////////////////////////////////////////////////////////////////////
|
||||
|
||||
/* Based on footnote generation code from:
|
||||
* http://www.brandspankingnew.net/archive/2005/07/format_footnote.html
|
||||
*/
|
||||
|
||||
footnotes: function () {
|
||||
// Delete existing footnote entries in case we're reloading the footnodes.
|
||||
var i;
|
||||
var noteholder = document.getElementById("footnotes");
|
||||
if (!noteholder) {
|
||||
return;
|
||||
}
|
||||
var entriesToRemove = [];
|
||||
for (i = 0; i < noteholder.childNodes.length; i++) {
|
||||
var entry = noteholder.childNodes[i];
|
||||
if (entry.nodeName.toLowerCase() == 'div' && entry.getAttribute("class") == "footnote")
|
||||
entriesToRemove.push(entry);
|
||||
}
|
||||
for (i = 0; i < entriesToRemove.length; i++) {
|
||||
noteholder.removeChild(entriesToRemove[i]);
|
||||
}
|
||||
|
||||
// Rebuild footnote entries.
|
||||
var cont = document.getElementById("content");
|
||||
var spans = cont.getElementsByTagName("span");
|
||||
var refs = {};
|
||||
var n = 0;
|
||||
for (i=0; i<spans.length; i++) {
|
||||
if (spans[i].className == "footnote") {
|
||||
n++;
|
||||
var note = spans[i].getAttribute("data-note");
|
||||
if (!note) {
|
||||
// Use [\s\S] in place of . so multi-line matches work.
|
||||
// Because JavaScript has no s (dotall) regex flag.
|
||||
note = spans[i].innerHTML.match(/\s*\[([\s\S]*)]\s*/)[1];
|
||||
spans[i].innerHTML =
|
||||
"[<a id='_footnoteref_" + n + "' href='#_footnote_" + n +
|
||||
"' title='View footnote' class='footnote'>" + n + "</a>]";
|
||||
spans[i].setAttribute("data-note", note);
|
||||
}
|
||||
noteholder.innerHTML +=
|
||||
"<div class='footnote' id='_footnote_" + n + "'>" +
|
||||
"<a href='#_footnoteref_" + n + "' title='Return to text'>" +
|
||||
n + "</a>. " + note + "</div>";
|
||||
var id =spans[i].getAttribute("id");
|
||||
if (id != null) refs["#"+id] = n;
|
||||
}
|
||||
}
|
||||
if (n == 0)
|
||||
noteholder.parentNode.removeChild(noteholder);
|
||||
else {
|
||||
// Process footnoterefs.
|
||||
for (i=0; i<spans.length; i++) {
|
||||
if (spans[i].className == "footnoteref") {
|
||||
var href = spans[i].getElementsByTagName("a")[0].getAttribute("href");
|
||||
href = href.match(/#.*/)[0]; // Because IE return full URL.
|
||||
n = refs[href];
|
||||
spans[i].innerHTML =
|
||||
"[<a href='#_footnote_" + n +
|
||||
"' title='View footnote' class='footnote'>" + n + "</a>]";
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
|
||||
install: function(toclevels) {
|
||||
var timerId;
|
||||
|
||||
function reinstall() {
|
||||
asciidoc.footnotes();
|
||||
if (toclevels) {
|
||||
asciidoc.toc(toclevels);
|
||||
}
|
||||
}
|
||||
|
||||
function reinstallAndRemoveTimer() {
|
||||
clearInterval(timerId);
|
||||
reinstall();
|
||||
}
|
||||
|
||||
timerId = setInterval(reinstall, 500);
|
||||
if (document.addEventListener)
|
||||
document.addEventListener("DOMContentLoaded", reinstallAndRemoveTimer, false);
|
||||
else
|
||||
window.onload = reinstallAndRemoveTimer;
|
||||
}
|
||||
|
||||
}
|
||||
asciidoc.install();
|
||||
/*]]>*/
|
||||
</script>
|
||||
</head>
|
||||
<body class="manpage">
|
||||
<div id="header">
|
||||
<h1>
|
||||
UNICHARSET(5) Manual Page
|
||||
</h1>
|
||||
<h2>NAME</h2>
|
||||
<div class="sectionbody">
|
||||
<p>unicharset -
|
||||
character properties file used by tesseract(1)
|
||||
</p>
|
||||
</div>
|
||||
</div>
|
||||
<div id="content">
|
||||
<div class="sect1">
|
||||
<h2 id="_description">DESCRIPTION</h2>
|
||||
<div class="sectionbody">
|
||||
<div class="paragraph"><p>Tesseract’s unicharset file contains information on each symbol
|
||||
(unichar) the Tesseract OCR engine is trained to recognize.</p></div>
|
||||
<div class="paragraph"><p>A unicharset file (i.e. <em>eng.unicharset</em>) is distributed as part of a
|
||||
Tesseract language pack (i.e. <em>eng.traineddata</em>). For information on
|
||||
extracting the unicharset file, see combine_tessdata(1).</p></div>
|
||||
<div class="paragraph"><p>The first line of a unicharset file contains the number of unichars in
|
||||
the file. After this line, each subsequent line provides information for
|
||||
a single unichar. The first such line contains a placeholder reserved for
|
||||
the space character. Each unichar is referred to within Tesseract by its
|
||||
Unichar ID, which is the line number (minus 1) within the unicharset file.
|
||||
Therefore, space gets unichar 0.</p></div>
|
||||
<div class="paragraph"><p>Each unichar line in the unicharset file (v2+) may have four space-separated fields:</p></div>
|
||||
<div class="literalblock">
|
||||
<div class="content monospaced">
|
||||
<pre>'character' 'properties' 'script' 'id'</pre>
|
||||
</div></div>
|
||||
<div class="paragraph"><p>Starting with Tesseract v3.02, more information may be given for each unichar:</p></div>
|
||||
<div class="literalblock">
|
||||
<div class="content monospaced">
|
||||
<pre>'character' 'properties' 'glyph_metrics' 'script' 'other_case' 'direction' 'mirror' 'normed_form'</pre>
|
||||
</div></div>
|
||||
<div class="paragraph"><p>Entries:</p></div>
|
||||
<div class="dlist"><dl>
|
||||
<dt class="hdlist1">
|
||||
<em>character</em>
|
||||
</dt>
|
||||
<dd>
|
||||
<p>
|
||||
The UTF-8 encoded string to be produced for this unichar.
|
||||
</p>
|
||||
</dd>
|
||||
<dt class="hdlist1">
|
||||
<em>properties</em>
|
||||
</dt>
|
||||
<dd>
|
||||
<p>
|
||||
An integer mask of character properties, one per bit.
|
||||
From least to most significant bit, these are: isalpha, islower, isupper,
|
||||
isdigit, ispunctuation.
|
||||
</p>
|
||||
</dd>
|
||||
<dt class="hdlist1">
|
||||
<em>glyph_metrics</em>
|
||||
</dt>
|
||||
<dd>
|
||||
<p>
|
||||
Ten comma-separated integers representing various standards
|
||||
for where this glyph is to be found within a baseline-normalized coordinate
|
||||
system where 128 is normalized to x-height.
|
||||
</p>
|
||||
<div class="ulist"><ul>
|
||||
<li>
|
||||
<p>
|
||||
min_bottom, max_bottom: the ranges where the bottom of the character can
|
||||
be found.
|
||||
</p>
|
||||
</li>
|
||||
<li>
|
||||
<p>
|
||||
min_top, max_top: the ranges where the top of the character may be found.
|
||||
</p>
|
||||
</li>
|
||||
<li>
|
||||
<p>
|
||||
min_width, max_width: horizontal width of the character.
|
||||
</p>
|
||||
</li>
|
||||
<li>
|
||||
<p>
|
||||
min_bearing, max_bearing: how far from the usual start position does the
|
||||
leftmost part of the character begin.
|
||||
</p>
|
||||
</li>
|
||||
<li>
|
||||
<p>
|
||||
min_advance, max_advance: how far from the printer’s cell left do we
|
||||
advance to begin the next character.
|
||||
</p>
|
||||
</li>
|
||||
</ul></div>
|
||||
</dd>
|
||||
<dt class="hdlist1">
|
||||
<em>script</em>
|
||||
</dt>
|
||||
<dd>
|
||||
<p>
|
||||
Name of the script (Latin, Common, Greek, Cyrillic, Han, null).
|
||||
</p>
|
||||
</dd>
|
||||
<dt class="hdlist1">
|
||||
<em>other_case</em>
|
||||
</dt>
|
||||
<dd>
|
||||
<p>
|
||||
The Unichar ID of the other case version of this character
|
||||
(upper or lower).
|
||||
</p>
|
||||
</dd>
|
||||
<dt class="hdlist1">
|
||||
<em>direction</em>
|
||||
</dt>
|
||||
<dd>
|
||||
<p>
|
||||
The Unicode BiDi direction of this character, as defined by
|
||||
ICU’s enum UCharDirection. (0 = Left to Right, 1 = Right to Left,
|
||||
2 = European Number…)
|
||||
</p>
|
||||
</dd>
|
||||
<dt class="hdlist1">
|
||||
<em>mirror</em>
|
||||
</dt>
|
||||
<dd>
|
||||
<p>
|
||||
The Unichar ID of the BiDirectional mirror of this character.
|
||||
For example the mirror of open paren is close paren, but Latin Capital C
|
||||
has no mirror, so it remains a Latin Capital C.
|
||||
</p>
|
||||
</dd>
|
||||
<dt class="hdlist1">
|
||||
<em>normed_form</em>
|
||||
</dt>
|
||||
<dd>
|
||||
<p>
|
||||
The UTF-8 representation of a "normalized form" of this unichar
|
||||
for the purpose of blaming a module for errors given ground truth text.
|
||||
For instance, a left or right single quote may normalize to an ASCII quote.
|
||||
</p>
|
||||
</dd>
|
||||
</dl></div>
|
||||
</div>
|
||||
</div>
|
||||
<div class="sect1">
|
||||
<h2 id="_example_v2">EXAMPLE (v2)</h2>
|
||||
<div class="sectionbody">
|
||||
<div class="literalblock">
|
||||
<div class="content monospaced">
|
||||
<pre>; 10 Common 46
|
||||
b 3 Latin 59
|
||||
W 5 Latin 40
|
||||
7 8 Common 66
|
||||
= 0 Common 93</pre>
|
||||
</div></div>
|
||||
<div class="paragraph"><p>";" is a punctuation character. Its properties are thus represented by the
|
||||
binary number 10000 (10 in hexadecimal).</p></div>
|
||||
<div class="paragraph"><p>"b" is an alphabetic character and a lower case character. Its properties are
|
||||
thus represented by the binary number 00011 (3 in hexadecimal).</p></div>
|
||||
<div class="paragraph"><p>"W" is an alphabetic character and an upper case character. Its properties are
|
||||
thus represented by the binary number 00101 (5 in hexadecimal).</p></div>
|
||||
<div class="paragraph"><p>"7" is just a digit. Its properties are thus represented by the binary number
|
||||
01000 (8 in hexadecimal).</p></div>
|
||||
<div class="paragraph"><p>"=" is not punctuation nor a digit nor an alphabetic character. Its properties
|
||||
are thus represented by the binary number 00000 (0 in hexadecimal).</p></div>
|
||||
<div class="paragraph"><p>Japanese or Chinese alphabetic character properties are represented by the
|
||||
binary number 00001 (1 in hexadecimal): they are alphabetic, but neither
|
||||
upper nor lower case.</p></div>
|
||||
</div>
|
||||
</div>
|
||||
<div class="sect1">
|
||||
<h2 id="_example_v3_02">EXAMPLE (v3.02)</h2>
|
||||
<div class="sectionbody">
|
||||
<div class="literalblock">
|
||||
<div class="content monospaced">
|
||||
<pre>110
|
||||
NULL 0 NULL 0
|
||||
N 5 59,68,216,255,87,236,0,27,104,227 Latin 11 0 1 N
|
||||
Y 5 59,68,216,255,91,205,0,47,91,223 Latin 33 0 2 Y
|
||||
1 8 59,69,203,255,45,128,0,66,74,173 Common 3 2 3 1
|
||||
9 8 18,66,203,255,89,156,0,39,104,173 Common 4 2 4 9
|
||||
a 3 58,65,186,198,85,164,0,26,97,185 Latin 56 0 5 a
|
||||
. . .</pre>
|
||||
</div></div>
|
||||
</div>
|
||||
</div>
|
||||
<div class="sect1">
|
||||
<h2 id="_caveats">CAVEATS</h2>
|
||||
<div class="sectionbody">
|
||||
<div class="paragraph"><p>Although the unicharset reader maintains the ability to read unicharsets
|
||||
of older formats and will assign default values to missing fields,
|
||||
the accuracy will be degraded.</p></div>
|
||||
<div class="paragraph"><p>Further, most other data files are indexed by the unicharset file,
|
||||
so changing it without re-generating the others is likely to have dire
|
||||
consequences.</p></div>
|
||||
</div>
|
||||
</div>
|
||||
<div class="sect1">
|
||||
<h2 id="_history">HISTORY</h2>
|
||||
<div class="sectionbody">
|
||||
<div class="paragraph"><p>The unicharset format first appeared with Tesseract 2.00, which was the
|
||||
first version to support languages other than English. The unicharset file
|
||||
contained only the first two fields, and the "ispunctuation" property was
|
||||
absent (punctuation was regarded as "0", as "=" is in the above example.</p></div>
|
||||
</div>
|
||||
</div>
|
||||
<div class="sect1">
|
||||
<h2 id="_see_also">SEE ALSO</h2>
|
||||
<div class="sectionbody">
|
||||
<div class="paragraph"><p>tesseract(1), combine_tessdata(1), unicharset_extractor(1)</p></div>
|
||||
<div class="paragraph"><p><a href="https://tesseract-ocr.github.io/tessdoc/Training-Tesseract.html">https://tesseract-ocr.github.io/tessdoc/Training-Tesseract.html</a></p></div>
|
||||
</div>
|
||||
</div>
|
||||
<div class="sect1">
|
||||
<h2 id="_author">AUTHOR</h2>
|
||||
<div class="sectionbody">
|
||||
<div class="paragraph"><p>The Tesseract OCR engine was written by Ray Smith and his research groups
|
||||
at Hewlett Packard (1985-1995) and Google (2006-2018).</p></div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
<div id="footnotes"><hr></div>
|
||||
<div id="footer">
|
||||
<div id="footer-text">
|
||||
Last updated
|
||||
2024-05-19 13:04:22 CEST
|
||||
</div>
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
Reference in New Issue
Block a user