2023-09-08 08:25:59 +00:00
|
|
|
import os
|
|
|
|
import sys
|
2023-09-11 17:02:17 +00:00
|
|
|
import logging
|
|
|
|
|
|
|
|
from logging_config import setup_logging
|
2023-09-08 08:25:59 +00:00
|
|
|
from subprocess import run, CalledProcessError
|
|
|
|
from concurrent.futures import ThreadPoolExecutor
|
2023-09-11 17:35:55 +00:00
|
|
|
from constants import CONVERTED_DIR, TEXT_OCR_FILE, IMAGE_EXTENSIONS
|
2023-09-11 17:02:17 +00:00
|
|
|
|
2023-09-08 08:25:59 +00:00
|
|
|
|
2023-09-11 17:02:17 +00:00
|
|
|
# Configure logging at import time using the project's logging_config helper,
# so the logging.* calls in the functions below go through that setup.
setup_logging()
|
2023-09-08 08:25:59 +00:00
|
|
|
|
|
|
|
|
2023-09-11 17:35:55 +00:00
|
|
|
def is_image_file(path):
    """Return True when *path* ends with one of ``IMAGE_EXTENSIONS``.

    The comparison lower-cases the path first, so upper-case suffixes on
    disk match lower-case entries in ``IMAGE_EXTENSIONS``.
    """
    lowered = path.lower()
    # str.endswith accepts a tuple of candidate suffixes and checks them all.
    return lowered.endswith(tuple(IMAGE_EXTENSIONS))
|
|
|
|
|
|
|
|
|
|
|
|
def ensure_directory_exists(directory):
    """Create *directory* (and any missing parents) if it is not there yet.

    Calling this for a directory that already exists is a no-op.

    Args:
        directory: Path of the directory that must exist afterwards.

    Raises:
        FileExistsError: If *directory* exists but is not a directory.
        OSError: If the directory cannot be created (e.g. permissions).
    """
    # makedirs with exist_ok=True avoids the check-then-create race of
    # "if not exists: mkdir" and also handles missing parent directories.
    os.makedirs(directory, exist_ok=True)
|
|
|
|
|
|
|
|
|
2023-09-08 08:25:59 +00:00
|
|
|
def convert_image(image_path):
    """Preprocess an image with ImageMagick before it is handed to OCR.

    Runs the ``convert`` command to write a grayscale, 300%-resized,
    55%-thresholded copy of the image into ``CONVERTED_DIR`` under the
    image's own file name.

    Args:
        image_path: Path of the image to preprocess.

    Returns:
        The path of the converted copy on success, or *image_path* unchanged
        when the conversion failed or ``convert`` could not be started.
    """
    logging.info(f"Converting {image_path}...")

    # NOTE(review): only the base name is kept, so two inputs with the same
    # file name in different directories map to the same converted path.
    converted_path = os.path.join(CONVERTED_DIR, os.path.basename(image_path))

    cmd = [
        "convert",
        image_path,
        "-colorspace", "Gray",
        "-resize", "300%",
        "-threshold", "55%",
        "-type", "Grayscale",
        converted_path,
    ]

    try:
        run(cmd, check=True)
    except (CalledProcessError, OSError):
        # CalledProcessError: convert exited non-zero.
        # OSError: the convert executable could not be launched at all
        # (e.g. ImageMagick is not installed). Either way, fall back to the
        # untouched original so OCR can still be attempted.
        logging.warning(f"Error converting {image_path} with ImageMagick. Using original for Tesseract.")
        return image_path

    logging.info(f"Converted image output to {converted_path}!")
    return converted_path
|
|
|
|
|
2023-09-11 17:02:17 +00:00
|
|
|
|
2023-09-08 08:25:59 +00:00
|
|
|
def ocr_image(image_path):
    """Run Tesseract on an image and return the path of the text it wrote.

    The output is placed in ``CONVERTED_DIR`` and named after the image's
    base name with its extension replaced by ``.txt``.

    Args:
        image_path: Path of the image to OCR.

    Returns:
        The path of the generated ``.txt`` file, or ``None`` when Tesseract
        failed or could not be started.
    """
    logging.info(f"OCR'ing {image_path}...")

    root_name, _ = os.path.splitext(os.path.basename(image_path))

    # Tesseract takes an output *base* and appends ".txt" itself. Build the
    # base directly instead of stripping ".txt" from the final path with
    # str.replace, which would also remove any other ".txt" occurrence in
    # the path (directory names, or stems such as "notes.txt").
    output_base = os.path.join(CONVERTED_DIR, root_name)
    text_path = output_base + ".txt"

    cmd = ["tesseract", image_path, output_base]

    try:
        run(cmd, check=True)
    except (CalledProcessError, OSError):
        # CalledProcessError: tesseract exited non-zero.
        # OSError: the tesseract executable could not be launched at all.
        logging.error(f"Error processing {image_path} with Tesseract. Skipping.")
        return None

    logging.info(f"OCRed to {text_path}!")
    return text_path
|
|
|
|
|
2023-09-11 17:02:17 +00:00
|
|
|
|
2023-09-08 08:25:59 +00:00
|
|
|
def process_image(image_path):
    """Preprocess and OCR one image, returning the recognised text.

    Args:
        image_path: Path of the source image.

    Returns:
        The contents of the text file produced by ``ocr_image``, or ``None``
        when no text file was produced.
    """
    converted_path = convert_image(image_path)
    logging.info(f"OCR'ing image {image_path} (now at {converted_path})...")

    text_path = ocr_image(converted_path)

    # ocr_image returns None on failure; also guard against a reported path
    # that does not actually exist on disk.
    if not text_path or not os.path.exists(text_path):
        logging.error(f"Cannot locate {text_path}! Cannot add text to final output!")
        return None

    # Tesseract writes its text output as UTF-8; read it explicitly as such
    # rather than relying on the platform's locale default encoding.
    with open(text_path, 'r', encoding="utf-8") as text_file:
        text_content = text_file.read()

    logging.info(f"Added text from {text_path} to final output.")
    return text_content
|
|
|
|
|
2023-09-11 17:02:17 +00:00
|
|
|
|
2023-09-11 17:35:55 +00:00
|
|
|
def process_images(directory_path):
    """OCR every image under *directory_path* and write the combined text.

    Walks *directory_path* recursively, processes each file accepted by
    ``is_image_file`` in a thread pool, and writes the recognised texts,
    joined by newlines, to ``TEXT_OCR_FILE``.

    Args:
        directory_path: Root directory to scan for images.

    Returns:
        A list with the recognised text of each image that produced output,
        in the order the images were discovered.
    """
    ensure_directory_exists(CONVERTED_DIR)

    image_paths = []
    for root, _dirs, files in os.walk(directory_path):
        for file in files:
            image_path = os.path.join(root, file)
            if is_image_file(image_path):
                image_paths.append(image_path)

    # Use a ThreadPoolExecutor to process images in parallel; executor.map
    # yields results in the same order as image_paths.
    with ThreadPoolExecutor() as executor:
        results = list(executor.map(process_image, image_paths))

    # Filter out any None values (images that produced no text).
    final_text = [text for text in results if text is not None]

    # Write explicitly as UTF-8 so OCR output containing non-ASCII characters
    # does not fail on platforms whose default encoding cannot represent it.
    with open(TEXT_OCR_FILE, 'w', encoding="utf-8") as f:
        f.write("\n".join(final_text))

    logging.info(f"All images processed! Final output saved to {TEXT_OCR_FILE}")
    return final_text
|
2023-09-08 09:58:19 +00:00
|
|
|
|
|
|
|
|
2023-09-08 08:25:59 +00:00
|
|
|
if __name__ == "__main__":
    # Script entry point: exactly one positional argument is expected,
    # the directory whose images should be OCR'ed.
    cli_args = sys.argv[1:]
    if len(cli_args) != 1:
        print("Usage: python images2text.py <directory_path>")
        sys.exit(1)
    process_images(cli_args[0])
|