Untitled

mail@pastecode.io avatarunknown
python
2 months ago
1.2 kB
6
Indexable
Never
#!/usr/bin/python3

import argparse
import pdf2image
import numpy as np
import scipy.ndimage
from scipy.special import comb
from PIL import Image


# Resolution (dots per inch) at which the input PDF pages are rasterized;
# the same value is passed as the resolution of the output PDF.
DPI = 200
# Sigma of the Gaussian blur applied to the grayscale page to estimate its
# background brightness.
S = 60
# Width, in pixels, of the frame around each page that is forced to white.
B = 20


def smoothstep(x):
    """Evaluate the cubic smoothstep polynomial ``3t^2 - 2t^3``.

    The input is first clamped to ``[0, 1]`` (giving ``t``), so values at or
    below 0 map to 0 and values at or above 1 map to 1.  Works on scalars
    and array-likes alike; array-likes are evaluated element-wise.
    """
    clamped = np.clip(x, 0, 1)
    squared = np.square(clamped)
    return 3 * squared - 2 * squared * clamped


def _clean_page(img):
    """Return a background-flattened, white-bordered copy of one page image.

    The page is divided by a heavily blurred grayscale copy of itself (an
    estimate of the local paper brightness), the ratio is pushed through
    ``smoothstep`` to increase contrast, and a ``B``-pixel frame around the
    page is painted white.

    Args:
        img: a PIL image of one rasterized page.

    Returns:
        A new 8-bit RGB PIL image of the same size.
    """
    # Force three channels so the per-channel division below broadcasts
    # against the (H, W, 1) background whatever mode the page came in.
    rgb = np.array(img.convert("RGB"), dtype=np.float32)
    gray = np.array(img.convert("L"), dtype=np.float32)

    # Blur in float: gaussian_filter returns an array of its input dtype, so
    # a uint8 input would quantize the background to whole gray levels.
    background = scipy.ndimage.gaussian_filter(gray, S)
    # Clamp to one gray level so dark regions cannot divide by zero, which
    # would yield inf/NaN (and NaN -> uint8 is an undefined conversion).
    background = np.maximum(background, 1.0)

    cleaned = smoothstep(rgb / background[..., None]) * 255

    # white borders (skipped for B == 0, where `-B:` would select everything)
    if B > 0:
        cleaned[:B] = 255
        cleaned[-B:] = 255
        cleaned[:, :B] = 255
        cleaned[:, -B:] = 255

    return Image.fromarray(cleaned.astype(np.uint8))


def main():
    """Command-line entry point: clean up every page of a scanned PDF.

    Parses the input file name and the optional ``-o`` output name, rasterizes
    the input at ``DPI``, cleans each page, and writes all pages into a single
    multi-page PDF.  Exits with an argparse error if the input has no pages.
    """
    parser = argparse.ArgumentParser(description="clean up PDF scans")
    parser.add_argument("filename", help="input PDF file")
    parser.add_argument("-o", default="out.pdf", help="specify output PDF file name")
    args = parser.parse_args()

    images = [
        _clean_page(page)
        for page in pdf2image.convert_from_path(args.filename, DPI)
    ]

    if not images:
        # Nothing to write; report it instead of failing with an IndexError.
        parser.error(f"no pages found in {args.filename}")

    first, *rest = images
    first.save(args.o, quality=70, save_all=True, append_images=rest, resolution=DPI)


# Run the command-line tool only when executed as a script, not on import.
if __name__ == "__main__":
    main()