"""Batch-OCR the images under a directory.

Each image is pre-processed with ImageMagick (``convert``), run through
Tesseract, and the recognised text of all images is concatenated into
``TEXT_OCR_FILE``.

Usage: python images2text.py <directory>
"""
import logging
import os
import sys
from concurrent.futures import ThreadPoolExecutor
from subprocess import CalledProcessError, run

from constants import CONVERTED_DIR, TEXT_OCR_FILE, IMAGE_EXTENSIONS
from logging_config import setup_logging

setup_logging()


def is_image_file(path):
    """Return True if *path* (compared lower-cased) ends with one of IMAGE_EXTENSIONS."""
    # str.endswith accepts a tuple of suffixes, so one call covers every extension.
    return path.lower().endswith(tuple(IMAGE_EXTENSIONS))


def ensure_directory_exists(directory):
    """Create *directory* (including missing parents) if it does not exist yet.

    Raises:
        FileExistsError: if *directory* exists but is not a directory.
    """
    # exist_ok avoids the check-then-create race of os.path.exists + os.mkdir.
    os.makedirs(directory, exist_ok=True)


def convert_image(image_path):
    """Pre-process *image_path* with ImageMagick to make it easier to OCR.

    The result is written into CONVERTED_DIR under the image's base name.
    NOTE(review): images from different sub-directories that share a base name
    map to the same converted file; confirm whether inputs can collide.

    Returns:
        The path of the converted image, or *image_path* itself when the
        conversion fails or ImageMagick cannot be launched.
    """
    logging.info(f"Converting {image_path}...")
    converted_path = os.path.join(CONVERTED_DIR, os.path.basename(image_path))
    cmd = [
        "convert", image_path,
        "-colorspace", "Gray",
        "-resize", "300%",
        "-threshold", "55%",
        "-type", "Grayscale",
        converted_path,
    ]
    try:
        run(cmd, check=True)
    except (CalledProcessError, OSError):
        # OSError covers a missing or non-executable `convert` binary; fall back
        # to the untouched original, exactly as for a failed conversion.
        logging.warning(
            f"Error converting {image_path} with ImageMagick. Using original for Tesseract."
        )
        return image_path
    logging.info(f"Converted image output to {converted_path}!")
    return converted_path


def ocr_image(image_path):
    """Run Tesseract on *image_path*, writing ``<root name>.txt`` into CONVERTED_DIR.

    Returns:
        The path of the text file Tesseract was asked to produce, or None when
        Tesseract fails or cannot be launched.
    """
    logging.info(f"OCR'ing {image_path}...")
    root_name, _ = os.path.splitext(os.path.basename(image_path))
    # Tesseract appends ".txt" to the output base itself, so pass the base
    # without an extension instead of stripping ".txt" from the final path
    # (str.replace would also strip ".txt" occurring elsewhere in the path).
    output_base = os.path.join(CONVERTED_DIR, root_name)
    text_path = f"{output_base}.txt"
    cmd = ["tesseract", image_path, output_base]
    try:
        run(cmd, check=True)
    except (CalledProcessError, OSError):
        # OSError covers a missing or non-executable `tesseract` binary.
        logging.error(f"Error processing {image_path} with Tesseract. Skipping.")
        return None
    logging.info(f"OCRed to {text_path}!")
    return text_path


def process_image(image_path):
    """Convert and OCR a single image.

    Returns:
        The recognised text, or None when no OCR output file was produced.
    """
    converted_path = convert_image(image_path)
    logging.info(f"OCR'ing image {image_path} (now at {converted_path})...")
    text_path = ocr_image(converted_path)

    if not text_path or not os.path.exists(text_path):
        logging.warning(f"Cannot locate {text_path}! Cannot add text to final output!")
        return None

    # Read as UTF-8 explicitly rather than relying on the locale default.
    with open(text_path, "r", encoding="utf-8") as text_file:
        text_content = text_file.read()
    logging.info(f"Added text from {text_path} to final output.")
    return text_content


def process_images(directory_path):
    """OCR every image found under *directory_path* (recursively).

    Images are processed in parallel threads; the texts of the images that
    were OCRed successfully are joined with newlines and written to
    TEXT_OCR_FILE.

    Returns:
        The list of recognised texts, in the order the images were discovered.
    """
    ensure_directory_exists(CONVERTED_DIR)

    image_paths = []
    for root, _dirs, files in os.walk(directory_path):
        for file_name in files:
            image_path = os.path.join(root, file_name)
            if is_image_file(image_path):
                image_paths.append(image_path)

    # The heavy lifting happens in external processes, so threads are enough
    # to run several images at once; executor.map preserves input order.
    with ThreadPoolExecutor() as executor:
        results = list(executor.map(process_image, image_paths))

    # Drop images whose OCR failed before assembling the final document.
    final_text = [text for text in results if text is not None]
    with open(TEXT_OCR_FILE, "w", encoding="utf-8") as output_file:
        output_file.write("\n".join(final_text))

    logging.info(f"All images processed! Final output saved to {TEXT_OCR_FILE}")
    return final_text


if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("Usage: python images2text.py <directory>", file=sys.stderr)
        sys.exit(1)
    process_images(sys.argv[1])