Free Code For Text Extraction
unknown
python
2 years ago
1.6 kB
3
Indexable
import mimetypes
import os

import cv2
import pytesseract
from PIL import Image

# Tesseract names its language data with three-letter codes ("eng"), while this
# module's historical default is the two-letter "en". Map the default so that
# callers relying on it keep getting English instead of a missing-language error.
_LANGUAGE_ALIASES = {'en': 'eng'}


def validate_image(image_path):
    """Raise ValueError unless the file name maps to an image MIME type.

    The check is based on ``mimetypes.guess_type``, i.e. on the path's
    extension only; the file contents are not inspected here.

    Args:
        image_path: Path of the file to check.

    Raises:
        ValueError: If no MIME type can be guessed or it is not ``image/*``.
    """
    mimetype = mimetypes.guess_type(image_path)[0]
    if not mimetype or not mimetype.startswith('image'):
        raise ValueError(f"File '{image_path}' is not an image.")


def preprocess_image(image_path):
    """Load an image and turn it into a binarized array for OCR.

    Pipeline: grayscale -> 5x5 Gaussian blur -> inverted binary threshold with
    Otsu's method -> dilation with a 2x2 rectangular kernel.

    Args:
        image_path: Path of the image file to load.

    Returns:
        The dilated, thresholded single-channel image as returned by OpenCV.

    Raises:
        FileNotFoundError: If ``image_path`` is not an existing file.
        ValueError: If the path does not look like an image, or OpenCV cannot
            decode it.
    """
    if not os.path.isfile(image_path):
        raise FileNotFoundError(f"No such file: '{image_path}'")
    validate_image(image_path)

    img = cv2.imread(image_path)
    # cv2.imread signals an unreadable/corrupt file by returning None rather
    # than raising; fail here with a clear message instead of inside cvtColor.
    if img is None:
        raise ValueError(f"Could not read image '{image_path}'.")

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    blurred = cv2.GaussianBlur(gray, (5, 5), 0)
    # The threshold value 0 is ignored: THRESH_OTSU picks it automatically.
    thresh = cv2.threshold(
        blurred, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
    )[1]
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
    dilated = cv2.dilate(thresh, kernel)
    return dilated


def format_text(text):
    """Collapse every run of whitespace in ``text`` into a single space."""
    return ' '.join(text.split())


def extract_text(image_path, Language='en'):
    """Run Tesseract OCR on an image file and return the cleaned-up text.

    Args:
        image_path: Path of the image file to read.
        Language: Tesseract language code passed as ``lang``. The legacy
            default ``'en'`` is translated to Tesseract's ``'eng'``; any other
            value is forwarded unchanged.

    Returns:
        The recognized text with whitespace normalized by ``format_text``.

    Raises:
        FileNotFoundError: If ``image_path`` is not an existing file.
        ValueError: If the file is not an image or cannot be decoded.
    """
    preprocessed_image = preprocess_image(image_path)
    pil_image = Image.fromarray(preprocessed_image)

    # --oem 1 / --psm 3 select the OCR engine mode and page segmentation mode.
    custom_config = r'--oem 1 --psm 3'
    lang = _LANGUAGE_ALIASES.get(Language, Language)
    extracted_text = pytesseract.image_to_string(
        pil_image, lang=lang, config=custom_config
    )
    return format_text(extracted_text)


def main(image_path):
    """Extract the text from ``image_path`` and print it to stdout."""
    extracted_text = extract_text(image_path)
    print(extracted_text)


# Only run the demo when executed as a script (or notebook cell), not on import.
if __name__ == "__main__":
    image_path = "/content/Nocode.png"
    main(image_path)
Editor is loading...
Leave a Comment