# Free code for text extraction.
#
# Origin: pastecode.io paste (author: unknown, language: python, size: 1.6 kB).
import pytesseract
from PIL import Image
import cv2
import os
import mimetypes

def validate_image(image_path):
    """Reject paths whose guessed MIME type is not an image type.

    The type is guessed from the file name with ``mimetypes.guess_type``;
    the file's contents are not inspected.

    Args:
        image_path: Path (or file name) to check.

    Raises:
        ValueError: If no MIME type can be guessed, or the guessed type
            does not start with ``'image'``.
    """
    guessed_type, _encoding = mimetypes.guess_type(image_path)
    looks_like_image = bool(guessed_type) and guessed_type.startswith('image')
    if looks_like_image:
        return
    raise ValueError(f"File '{image_path}' is not an image.")

def preprocess_image(image_path):
    """Load an image from disk and prepare it for OCR.

    The image is converted to grayscale, smoothed with a 5x5 Gaussian
    blur, binarized with an inverted Otsu threshold, and finally dilated
    with a 2x2 rectangular kernel.

    Args:
        image_path: Path of the image file to load.

    Returns:
        The dilated image array produced by ``cv2.dilate``.

    Raises:
        FileNotFoundError: If ``image_path`` is not an existing file.
        ValueError: If the file name does not look like an image, or if
            OpenCV cannot decode the file.
    """
    if not os.path.isfile(image_path):
        raise FileNotFoundError(f"No such file: '{image_path}'")

    validate_image(image_path)

    img = cv2.imread(image_path)
    # cv2.imread reports an unreadable/corrupt file by returning None
    # instead of raising; fail here with a clear message rather than
    # letting cvtColor crash on a None input.
    if img is None:
        raise ValueError(f"Could not read image '{image_path}'.")

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    blurred = cv2.GaussianBlur(gray, (5, 5), 0)
    # cv2.threshold returns (threshold_value, image); only the image is used.
    thresh = cv2.threshold(
        blurred, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
    )[1]

    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
    dilated = cv2.dilate(thresh, kernel)

    return dilated

def format_text(text):
    """Collapse every run of whitespace in ``text`` into a single space.

    Leading and trailing whitespace is dropped as well, because
    ``str.split()`` with no arguments discards empty fields.
    """
    words = text.split()
    return ' '.join(words)

# Tesseract names its language packs with three-letter codes; accept a few
# common two-letter codes (including this function's default) as aliases.
_TESSERACT_LANG_ALIASES = {
    'en': 'eng',
    'fr': 'fra',
    'de': 'deu',
    'es': 'spa',
    'it': 'ita',
}


def extract_text(image_path, Language='en'):
    """Run OCR on an image file and return the recognized text.

    The image is preprocessed with ``preprocess_image``, passed to
    Tesseract through ``pytesseract.image_to_string``, and the result is
    whitespace-normalized with ``format_text``.

    Args:
        image_path: Path of the image file to read.
        Language: Language of the text. Either a Tesseract language code
            (e.g. ``'eng'``) or one of the two-letter aliases in
            ``_TESSERACT_LANG_ALIASES``. Any other value is passed to
            Tesseract unchanged.

    Returns:
        The recognized text with all whitespace runs collapsed to single
        spaces.

    Raises:
        FileNotFoundError: Propagated from ``preprocess_image``.
        ValueError: Propagated from ``preprocess_image``, or raised here
            if preprocessing yields ``None``.
    """
    preprocessed_image = preprocess_image(image_path)

    # Defensive guard kept from the original implementation.
    if preprocessed_image is None:
        raise ValueError(f"Image '{image_path}' is empty after preprocessing.")

    pil_image = Image.fromarray(preprocessed_image)

    # Previously ``Language`` was accepted but never forwarded to Tesseract.
    tesseract_lang = _TESSERACT_LANG_ALIASES.get(Language, Language)

    # Tesseract options: OCR engine mode 1, page segmentation mode 3.
    custom_config = r'--oem 1 --psm 3'
    extracted_text = pytesseract.image_to_string(
        pil_image, lang=tesseract_lang, config=custom_config
    )

    return format_text(extracted_text)

def main(image_path):
    """Extract the text from the image at ``image_path`` and print it."""
    print(extract_text(image_path))

# Default input image used when this file is run as a script.
image_path = "/content/Nocode.png"

# Only run OCR when executed directly, so that importing this module to
# reuse its functions does not trigger work on a hard-coded path.
if __name__ == "__main__":
    main(image_path)
# Leave a Comment  (footer text from the paste site; not part of the script)