# web-scraper
#
# avatar
# unknown
# python
# 3 years ago
# 7.0 kB
# 10
# Indexable
import json
from progress.bar import IncrementalBar
import requests
from io import BytesIO
import os
from PIL import Image, ImageOps
import readchar
import datetime
import answer

# Root URL of the server; the download helpers below append a
# server-relative path to it to build each request URL.
base_url = "https://oauth.digiboek.be"


def get_files(book_number, page_number, pages):
    """Download the media files referenced by one spread.

    Fetches the spread's ``spread.js`` description and walks its
    ``structure.children``. For every child whose ``link`` entry contains
    ``"photoviewer"``, ``"video"`` or ``"sound"``, the file at
    ``link[0][0]['src']`` is downloaded into the directory ``pages`` (relative
    to the current working directory), named after ``link[2]`` with characters
    that are illegal in Windows file names removed.

    Args:
        book_number: Issue identifier inserted into the request URL.
        page_number: Spread identifier inserted into the request URL.
        pages: Name of the output directory for this spread.

    Raises:
        OSError: If the output directory cannot be created.
    """
    url = base_url + \
        f"/userfiles/static/output/html5/issues/{book_number}/{page_number}/spread.js"
    response = requests.get(url=url)
    data = response.json()

    media_types = ("photoviewer", "video", "sound")
    # Only these extensions are downloaded; anything else is ignored.
    downloaders = {".jpg": get_photo, ".mp4": get_video, ".mp3": get_sound}
    strip_forbidden = str.maketrans("", "", "?\\/:*\"<>|,")

    for child in data['structure']['children']:
        try:
            for item in child['link']:
                if item not in media_types:
                    continue
                # Created lazily so spreads without media leave no empty
                # directory behind; exist_ok makes repeated calls harmless.
                os.makedirs(os.path.join(os.getcwd(), pages), exist_ok=True)
                link = child['link'][0][0]['src']
                name = child['link'][2].strip().translate(strip_forbidden)
                extension = link[link.rfind("."):]
                # os.path.join instead of a hard-coded "\\" keeps the path
                # valid on every platform.
                output = os.path.join(pages, name + extension)
                download = downloaders.get(extension)
                if download is not None:
                    download(link, output)
        except KeyError:
            # Children without a usable 'link' entry carry no media.
            continue


def get_photo(link, output):
    """Download the image at ``base_url + link`` and save it as a JPEG file.

    Args:
        link: Server-relative path of the image.
        output: Destination path. ``.jpg`` is appended only when the path
            does not already end in it.
    """
    response = requests.get(base_url + link)
    image = Image.open(BytesIO(response.content))
    # get_files passes a path that already ends in ".jpg"; appending the
    # suffix unconditionally produced files named "name.jpg.jpg".
    if not output.lower().endswith(".jpg"):
        output += ".jpg"
    image.save(output)


def get_video(link, output):
    """Stream the file at ``base_url + link`` to ``output``.

    Args:
        link: Server-relative path of the video.
        output: Destination file path; an existing file is overwritten.
    """
    # stream=True defers the body download so iter_content really reads it
    # piece by piece; without it the whole file is held in memory before the
    # first chunk is written. The with-block releases the connection.
    with requests.get(base_url + link, stream=True) as response:
        with open(output, 'wb') as file:
            for chunk in response.iter_content(chunk_size=64 * 1024):
                if chunk:
                    file.write(chunk)


def get_sound(link, output):
    """Fetch ``base_url + link`` and write the response body to ``output``.

    Args:
        link: Server-relative path of the audio file.
        output: Destination file path; an existing file is overwritten.
    """
    audio_bytes = requests.get(base_url + link).content
    with open(output, 'wb') as sink:
        sink.write(audio_bytes)


# ASCII-art banner printed after each screen clear.
# Raw string: the art contains "\|", which is an invalid escape sequence in a
# normal string literal and triggers a warning at compile time. The resulting
# text is identical.
downloadText = r"""
 ____  _____ _ _ _ _____ __    _____ _____ ____  _____ _____
|    \|     | | | |   | |  |  |     |  _  |    \|   __| __  |
|  |  |  |  | | | | | | |  |__|  |  |     |  |  |   __|    -|
|____/|_____|_____|_|___|_____|_____|__|__|____/|_____|__|__|
"""
# Clear the terminal and show the banner. "cls" only exists on Windows, so
# fall back to "clear" elsewhere.
os.system('cls' if os.name == 'nt' else 'clear')
print(downloadText)

# Interactive configuration; the three numbers must parse as integers.
COOKIE = input("What is your cookie? ")
BEGIN_NUMBER = int(input("What is the begin number of the page? "))
END_NUMBER = int(input("What is the end number of the page? "))
MAGAZINE_NUMBER = int(input('What is the magazine number? '))
WEBSITE_URL = "https://oauth.digiboek.be"

# All output goes into a directory named after the magazine number.
# exist_ok tolerates a directory left by an earlier run, while any other
# failure (e.g. missing permissions) is reported here instead of being
# swallowed and surfacing later in chdir.
os.makedirs(os.path.join(os.getcwd(), str(MAGAZINE_NUMBER)), exist_ok=True)

print("Current working directory: {0}".format(os.getcwd()))

# Change the current working directory
os.chdir(os.path.join(os.getcwd(), str(MAGAZINE_NUMBER)))

# Print the current working directory
print("Current working directory: {0}".format(os.getcwd()))


payload = ""  # not used below; kept so the module's global names are unchanged

# One HTTP session is shared by all spread.js requests.
session = requests.Session()

# Probe the first spread to learn how its pages are stored.
res = session.get(
    WEBSITE_URL + f"/userfiles/static/output/html5/issues/{MAGAZINE_NUMBER}/{BEGIN_NUMBER}/spread.js")
data = res.text
jsondata = json.loads(data)

# Number of high-resolution background images on the first spread.
amountPages = len(jsondata['structure']['spread']['bg']['hires'])

# Evaluates to 2 when the spread has one image and 1 when it has two.
stepSize = (1 + amountPages % 2)
print(f"Stepsize: {stepSize}")

basePath = os.getcwd()
imgPath = os.path.join(basePath, "images")

start = datetime.datetime.now()

# "cls" only exists on Windows, so fall back to "clear" elsewhere.
os.system('cls' if os.name == 'nt' else 'clear')
print(downloadText)
# The multiplier (1 + stepSize % 2) is 2 for stepSize 1 and 1 for stepSize 2.
bar = IncrementalBar('Pages downloaded', max=(
    (END_NUMBER - BEGIN_NUMBER)*(1 + stepSize % 2) + 1), suffix='%(percent)d%% - %(eta)ds')
def _open_remote_image(path):
    """Download ``WEBSITE_URL + path`` via the shared session and decode it."""
    return Image.open(BytesIO(session.get(WEBSITE_URL + path).content))


# Pages of the final PDF, in order, as RGB images.
image_list = []
for currentPage in range(BEGIN_NUMBER, END_NUMBER + 1, stepSize):
    # 0 for the first spread, whose left half is skipped below.
    currentPageNumber = (currentPage - BEGIN_NUMBER) * (1 + stepSize % 2)
    pagesBefore = len(image_list)

    res = session.get(
        WEBSITE_URL + f"/userfiles/static/output/html5/issues/{MAGAZINE_NUMBER}/{currentPage}/spread.js")
    data = res.text
    jsondata = json.loads(data)

    hires = jsondata['structure']['spread']['bg']['hires']
    amountPages = len(hires)

    if amountPages == 1:
        # A single image covers the whole spread: overlay the answers first,
        # then cut the result into a left and a right page.
        im = _open_remote_image(hires[0])

        answer_link = answer.get_answer_link(
            COOKIE, MAGAZINE_NUMBER, currentPage)
        im_answ = _open_remote_image(answer_link)
        edited_im = ImageOps.fit(im_answ, im.size)
        # The overlay is also passed as the paste mask.
        # NOTE(review): assumes the answer image has an alpha channel - confirm.
        im.paste(edited_im, edited_im)

        imgwidth, imgheight = im.size
        imageLeft = im.crop((0, 0, int(imgwidth / 2), imgheight))
        imageRight = im.crop((int(imgwidth / 2), 0, imgwidth, imgheight))
        if currentPageNumber != 0:
            image_list.append(imageLeft.convert('RGB'))
        image_list.append(imageRight.convert('RGB'))
        del im
    elif amountPages == 2:
        # Separate images per page: split the answer overlay in two and
        # apply each half to its page.
        answer_link = answer.get_answer_link(
            COOKIE, MAGAZINE_NUMBER, currentPage)
        im_answ = _open_remote_image(answer_link)
        imgwidth, imgheight = im_answ.size
        im_answ_r = im_answ.crop((int(imgwidth / 2), 0, imgwidth, imgheight))
        im_answ_l = im_answ.crop((0, 0, int(imgwidth / 2), imgheight))

        leftPage, rightPage = hires[0], hires[1]

        if currentPageNumber != 0:
            imageLeft = _open_remote_image(leftPage)
            edited_im = ImageOps.fit(im_answ_l, imageLeft.size)
            imageLeft.paste(edited_im, edited_im)
            image_list.append(imageLeft.convert('RGB'))

        imageRight = _open_remote_image(rightPage)
        edited_im = ImageOps.fit(im_answ_r, imageRight.size)
        imageRight.paste(edited_im, edited_im)
        image_list.append(imageRight.convert('RGB'))

    # Advance by the pages actually added: the first spread contributes only
    # one, so a fixed step of 2 would push the bar past its maximum.
    bar.next(len(image_list) - pagesBefore)

if image_list:
    # Pillow writes a multi-page PDF from the first image plus the rest
    # passed as append_images.
    firstImg = image_list.pop(0)
    firstImg.save(os.path.join(basePath, f"{MAGAZINE_NUMBER}.pdf"),
                  save_all=True, append_images=image_list)
    bar.finish()
    print(f"Time elapsed: {datetime.datetime.now() - start}")
else:
    # Nothing was collected (e.g. the end number is below the begin number);
    # report it instead of failing with an IndexError.
    bar.finish()
    print("No pages were downloaded; no PDF was written.")
# Block until a key is pressed before the script ends.
print("Press Any Key To Exit")
k = readchar.readchar()
# Editor is loading...