# anki-csv2ankicards/image_processing.py

import os
import sys
import logging

from logging_config import setup_logging
from subprocess import run, CalledProcessError
from concurrent.futures import ThreadPoolExecutor
from constants import CONVERTED_DIR, TEXT_OCR_FILE, IMAGE_EXTENSIONS


# Configure logging through the project's logging_config helper. This sits at
# module top level, so it runs whenever this module is imported, not only when
# the file is executed as a script.
setup_logging()


def is_image_file(path):
    """Tell whether *path* ends with one of the entries in IMAGE_EXTENSIONS.

    The path is lower-cased before the comparison; the extensions are
    compared exactly as they appear in IMAGE_EXTENSIONS.
    """
    for extension in IMAGE_EXTENSIONS:
        if path.lower().endswith(extension):
            return True
    return False


def ensure_directory_exists(directory):
    """Create *directory*, including any missing parents, if it is absent.

    Calling this on a directory that already exists is a no-op.

    Args:
        directory: Path of the directory that must exist afterwards.

    Raises:
        FileExistsError: If *directory* exists but is not a directory.
        OSError: If the directory cannot be created (e.g. permissions).
    """
    # makedirs(exist_ok=True) replaces the former "check, then mkdir" pair:
    # it creates intermediate directories and does not fail when something
    # else creates the directory between the check and the creation.
    os.makedirs(directory, exist_ok=True)


def convert_image(image_path):
    """Pre-process an image with ImageMagick before it is handed to OCR.

    Runs the ``convert`` command to turn the image into grayscale, enlarge it
    to 300%, and apply a 55% threshold. The result is written into
    CONVERTED_DIR under the same base name as the input.

    Args:
        image_path: Path of the source image.

    Returns:
        The path of the converted image on success, or the original
        *image_path* when the conversion could not be performed, so the
        caller can still attempt OCR on the unmodified file.
    """
    logging.info(f"Converting {image_path}...")
    # NOTE: only the base name is kept, so two inputs with the same file name
    # in different directories map to the same converted path.
    converted_path = os.path.join(CONVERTED_DIR, os.path.basename(image_path))
    cmd = [
        "convert",
        image_path,
        "-colorspace", "Gray",
        "-resize", "300%",
        "-threshold", "55%",
        "-type", "Grayscale",
        converted_path,
    ]

    try:
        run(cmd, check=True)
    except (CalledProcessError, OSError):
        # CalledProcessError: convert exited with a non-zero status.
        # OSError (e.g. FileNotFoundError): the convert executable could not
        # be started at all. Either way, fall back to the original image
        # instead of aborting the whole run.
        logging.warning(f"Error converting {image_path} with ImageMagick. Using original for Tesseract.")
        return image_path

    logging.info(f"Converted image output to {converted_path}!")
    return converted_path


def ocr_image(image_path):
    """Run Tesseract on an image and return the path of the text output.

    The output is placed in CONVERTED_DIR and named after the image's base
    name with its extension replaced by ``.txt``.

    Args:
        image_path: Path of the image to OCR.

    Returns:
        The path of the ``.txt`` file Tesseract was asked to produce, or None
        when Tesseract failed or could not be started.
    """
    logging.info(f"OCR'ing {image_path}...")

    root_name, _ = os.path.splitext(os.path.basename(image_path))

    # Tesseract takes an output *base* and appends ".txt" itself. Build the
    # base directly rather than stripping ".txt" from the final path: a
    # str.replace would also remove any other ".txt" occurring in the
    # directory or file name and make the returned path disagree with the
    # file Tesseract actually writes.
    output_base = os.path.join(CONVERTED_DIR, root_name)
    text_path = f"{output_base}.txt"

    cmd = ["tesseract", image_path, output_base]
    try:
        run(cmd, check=True)
    except (CalledProcessError, OSError):
        # CalledProcessError: tesseract exited with a non-zero status.
        # OSError (e.g. FileNotFoundError): the executable could not be run.
        logging.warning(f"Error processing {image_path} with Tesseract. Skipping.")
        return None

    logging.info(f"OCRed to {text_path}!")
    return text_path


def process_image(image_path):
    """Convert a single image, OCR it, and return the recognised text.

    Args:
        image_path: Path of the source image.

    Returns:
        The contents of the text file produced by OCR, or None when OCR
        failed or the expected text file does not exist.
    """
    converted_path = convert_image(image_path)
    logging.info(f"OCR'ing image {image_path} (now at {converted_path})...")
    text_path = ocr_image(converted_path)

    if not text_path:
        # OCR reported a failure; there is no output path to look for.
        logging.warning(f"No OCR output for {image_path}! Cannot add text to final output!")
        return None
    if not os.path.exists(text_path):
        logging.warning(f"Cannot locate {text_path}! Cannot add text to final output!")
        return None

    # Tesseract writes its text output as UTF-8; decode it explicitly instead
    # of relying on the platform's default encoding.
    with open(text_path, "r", encoding="utf-8") as text_file:
        text_content = text_file.read()
    logging.info(f"Added text from {text_path} to final output.")
    return text_content


def process_images(directory_path):
    """OCR every image found under *directory_path* and save the joined text.

    Walks *directory_path* recursively, processes each file accepted by
    is_image_file in a thread pool, and writes the non-empty results,
    separated by newlines, to TEXT_OCR_FILE.

    Args:
        directory_path: Root directory to search for images.

    Returns:
        A list with the OCR text of each successfully processed image, in
        sorted order of the image paths.
    """
    ensure_directory_exists(CONVERTED_DIR)

    image_paths = []
    for root, _dirs, files in os.walk(directory_path):
        for file_name in files:
            image_path = os.path.join(root, file_name)
            if is_image_file(image_path):
                image_paths.append(image_path)
    # os.walk yields entries in an arbitrary, filesystem-dependent order;
    # sorting makes the order of the text in the output reproducible.
    image_paths.sort()

    # Process images in parallel; executor.map returns results in the same
    # order as image_paths regardless of which worker finishes first.
    with ThreadPoolExecutor() as executor:
        results = list(executor.map(process_image, image_paths))

    # Drop images that produced no text, then write everything in one go.
    final_text = [text for text in results if text is not None]
    # Written as UTF-8 to match how the OCR text was decoded above.
    with open(TEXT_OCR_FILE, "w", encoding="utf-8") as f:
        f.write("\n".join(final_text))

    logging.info(f"All images processed! Final output saved to {TEXT_OCR_FILE}")
    return final_text


if __name__ == "__main__":
    # Script entry point: expects exactly one argument, the directory whose
    # images should be OCR'ed.
    if len(sys.argv) != 2:
        # Derive the script name from argv so the usage text stays correct
        # even if the file is renamed; usage errors go to stderr.
        script_name = os.path.basename(sys.argv[0])
        print(f"Usage: python {script_name} <directory_path>", file=sys.stderr)
        sys.exit(1)
    process_images(sys.argv[1])