Added Linux OCR Dependencies

This commit is contained in:
2025-04-13 03:18:13 -06:00
parent c741d81a45
commit e5fedab25b
2 changed files with 20 additions and 13 deletions

View File

@ -5,21 +5,28 @@ import torch
import pytesseract import pytesseract
import easyocr import easyocr
import numpy as np import numpy as np
import platform
from PIL import Image from PIL import Image
# ---------------------------------------------------------------------
# Configure cross-platform Tesseract path
# ---------------------------------------------------------------------
# On Windows the Tesseract-OCR folder located next to this module is used.
# On every other platform the bare command name "tesseract" is configured,
# which assumes a system-installed Tesseract.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
SYSTEM = platform.system()

# Bound on every platform so that referencing or importing these names never
# fails; they stay None when the bundled Windows Tesseract is not in use.
TESSERACT_FOLDER = None
TESSERACT_EXE = None
TESSDATA_DIR = None

if SYSTEM == "Windows":
    TESSERACT_FOLDER = os.path.join(BASE_DIR, "Tesseract-OCR")
    TESSERACT_EXE = os.path.join(TESSERACT_FOLDER, "tesseract.exe")
    TESSDATA_DIR = os.path.join(TESSERACT_FOLDER, "tessdata")

    # Fail at import time when the bundled binary is absent.
    if not os.path.isfile(TESSERACT_EXE):
        raise EnvironmentError("Missing tesseract.exe in /Tesseract-OCR. Ensure the full folder is copied.")

    pytesseract.pytesseract.tesseract_cmd = TESSERACT_EXE
    # Point TESSDATA_PREFIX at the bundled tessdata folder.
    os.environ["TESSDATA_PREFIX"] = TESSDATA_DIR
else:
    # Assume Linux/macOS with system-installed Tesseract
    pytesseract.pytesseract.tesseract_cmd = "tesseract"
# --------------------------------------------------------------------- # ---------------------------------------------------------------------
# EasyOCR Global Instances # EasyOCR Global Instances
@ -54,7 +61,7 @@ def run_ocr_on_base64(image_b64: str, engine: str = "tesseract", backend: str =
try: try:
text = pytesseract.image_to_string(image, config="--psm 6 --oem 1") text = pytesseract.image_to_string(image, config="--psm 6 --oem 1")
except pytesseract.TesseractNotFoundError: except pytesseract.TesseractNotFoundError:
raise RuntimeError("Tesseract binary not found in internal folder.") raise RuntimeError("Tesseract binary not found or not available on this platform.")
elif engine == "easyocr": elif engine == "easyocr":
initialize_ocr_engines() initialize_ocr_engines()
reader = easyocr_reader_gpu if backend == "gpu" else easyocr_reader_cpu reader = easyocr_reader_gpu if backend == "gpu" else easyocr_reader_cpu
@ -82,4 +89,4 @@ def run_ocr_on_base64(image_b64: str, engine: str = "tesseract", backend: str =
else: else:
raise ValueError(f"OCR engine '{engine}' not recognized.") raise ValueError(f"OCR engine '{engine}' not recognized.")
return [line.strip() for line in text.splitlines() if line.strip()] return [line.strip() for line in text.splitlines() if line.strip()]

View File

@ -42,13 +42,13 @@ install_core_dependencies() {
case "$DISTRO_ID" in case "$DISTRO_ID" in
ubuntu|debian) ubuntu|debian)
sudo apt update -qq sudo apt update -qq
sudo apt install -y python3 python3-venv python3-pip nodejs npm git curl sudo apt install -y python3 python3-venv python3-pip nodejs npm git curl tesseract-ocr
;; ;;
rhel|centos|fedora|rocky) rhel|centos|fedora|rocky)
sudo dnf install -y python3 python3-pip nodejs npm git curl sudo dnf install -y python3 python3-pip nodejs npm git curl tesseract
;; ;;
arch) arch)
sudo pacman -Sy --noconfirm python python-venv python-pip nodejs npm git curl sudo pacman -Sy --noconfirm python python-venv python-pip nodejs npm git curl tesseract
;; ;;
*) *)
echo -e "${RED}${CROSSMARK} Unsupported Linux distribution: ${DISTRO_ID}${RESET}" echo -e "${RED}${CROSSMARK} Unsupported Linux distribution: ${DISTRO_ID}${RESET}"