# Free Code For Text Extraction
# Paste-site metadata (not part of the program): unknown | python | 2 years ago | 1.6 kB | 7 | Indexable
import pytesseract
from PIL import Image
import cv2
import os
import mimetypes
def validate_image(image_path):
    """Raise ``ValueError`` unless ``image_path`` looks like an image file.

    The check is based solely on the MIME type that ``mimetypes`` guesses
    from the file name; the file contents are never opened.

    Args:
        image_path: Path (or file name) to inspect.

    Raises:
        ValueError: If no MIME type can be guessed or the guessed type
            does not start with ``image``.
    """
    guessed_type, _encoding = mimetypes.guess_type(image_path)
    if guessed_type and guessed_type.startswith('image'):
        return
    raise ValueError(f"File '{image_path}' is not an image.")
def preprocess_image(image_path):
    """Load an image from disk and prepare it for OCR.

    The image is read with OpenCV, converted to grayscale, smoothed with a
    5x5 Gaussian blur, binarized with an inverted Otsu threshold and finally
    dilated with a 2x2 rectangular kernel.

    Args:
        image_path: Path of the image file to load.

    Returns:
        The dilated binary image produced by ``cv2.dilate``.

    Raises:
        FileNotFoundError: If ``image_path`` is not an existing file.
        ValueError: If the file name does not look like an image (see
            ``validate_image``) or OpenCV cannot decode the file.
    """
    if not os.path.isfile(image_path):
        raise FileNotFoundError(f"No such file: '{image_path}'")
    validate_image(image_path)

    img = cv2.imread(image_path)
    # cv2.imread reports failure by returning None instead of raising, so a
    # corrupt or unsupported file must be rejected here; otherwise cvtColor
    # fails later with an opaque OpenCV assertion error.
    if img is None:
        raise ValueError(f"Image '{image_path}' could not be read.")

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    blurred = cv2.GaussianBlur(gray, (5, 5), 0)
    # threshold() returns (computed_threshold, image); only the image is used.
    thresh = cv2.threshold(
        blurred, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
    )[1]
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
    return cv2.dilate(thresh, kernel)
def format_text(text):
    """Collapse every run of whitespace in ``text`` into a single space.

    Leading and trailing whitespace is removed as a side effect of
    splitting on whitespace and re-joining the pieces.
    """
    words = text.split()
    return ' '.join(words)
# Two-letter ISO 639-1 codes mapped to the three-letter names Tesseract uses
# for its trained-data files. Codes not listed here are passed through as-is.
_TESSERACT_LANG_ALIASES = {
    'en': 'eng',
    'fr': 'fra',
    'de': 'deu',
    'es': 'spa',
    'it': 'ita',
    'pt': 'por',
    'nl': 'nld',
    'ru': 'rus',
}


def extract_text(image_path, Language='en'):
    """Run OCR on an image file and return the recognized text.

    Args:
        image_path: Path of the image file to read.
        Language: Language to recognize. Common two-letter codes (such as
            the default ``'en'``) are translated to Tesseract's
            three-letter names; any other value is handed to Tesseract
            unchanged.

    Returns:
        The recognized text with all whitespace runs collapsed to single
        spaces (see ``format_text``).

    Raises:
        ValueError: If preprocessing yields no image.
        Any exception raised by ``preprocess_image`` or ``pytesseract``
        propagates unchanged.
    """
    preprocessed_image = preprocess_image(image_path)
    if preprocessed_image is None:
        raise ValueError(f"Image '{image_path}' is empty after preprocessing.")
    pil_image = Image.fromarray(preprocessed_image)

    # The Language argument used to be ignored; forward it so callers can
    # actually select the recognition language.
    tesseract_lang = _TESSERACT_LANG_ALIASES.get(Language, Language)

    # --oem 1: LSTM engine only; --psm 3: fully automatic page segmentation.
    custom_config = r'--oem 1 --psm 3'
    extracted_text = pytesseract.image_to_string(
        pil_image, lang=tesseract_lang, config=custom_config
    )
    return format_text(extracted_text)
def main(image_path):
    """Extract the text from ``image_path`` and print it to standard output."""
    print(extract_text(image_path))
# Sample input used when the file is executed directly.
image_path = "/content/Nocode.png"

# Only run OCR when executed as a script, so importing this module for its
# functions does not trigger processing of the hard-coded sample path.
if __name__ == "__main__":
    main(image_path)
# (Paste-site footer, not part of the program: "Leave a Comment")