web-scraper
unknown
python
3 years ago
7.0 kB
14
Indexable
import json
from progress.bar import IncrementalBar
import requests
from io import BytesIO
import os
from PIL import Image, ImageOps
import readchar
import datetime
import answer
# Root URL of the digiboek OAuth host; all media/metadata paths are relative to it.
base_url = "https://oauth.digiboek.be"
def get_files(book_number, page_number, pages):
    """Download the media attachments of one spread into a sub-directory.

    Fetches the spread's ``spread.js`` metadata, then for every child that
    links a "photoviewer", "video" or "sound" item, downloads the linked
    file into a directory named *pages* (created on demand).

    book_number -- magazine/issue id on the server
    page_number -- spread number whose metadata is fetched
    pages       -- directory name used for the downloaded files
    """
    url = base_url + \
        f"/userfiles/static/output/html5/issues/{book_number}/{page_number}/spread.js"
    response = requests.get(url=url)
    data = response.json()
    # Translation table deleting characters that are illegal/unwanted in
    # file names; built once instead of chaining .replace() per character.
    illegal_chars = str.maketrans('', '', "?\\/:*\"<>|,")
    for child in data['structure']['children']:
        try:
            for item in child['link']:
                if item in ("photoviewer", "video", "sound"):
                    # exist_ok replaces the old try/except-OSError-pass dance.
                    os.makedirs(os.path.join(os.getcwd(), pages), exist_ok=True)
                    link = child['link'][0][0]['src']
                    name = child['link'][2].strip().translate(illegal_chars)
                    # The remote file's extension selects the downloader.
                    extension = os.path.splitext(link)[1]
                    # os.path.join instead of a hard-coded "\\" so the path
                    # is also valid on non-Windows systems.
                    output = os.path.join(pages, name + extension)
                    if extension == ".jpg":
                        get_photo(link, output)
                    elif extension == ".mp4":
                        get_video(link, output)
                    elif extension == ".mp3":
                        get_sound(link, output)
        except KeyError:
            # Children without the expected 'link' layout are skipped.
            pass
def get_photo(link, output):
    """Download an image from the site and save it to *output*.

    *output* already ends in ".jpg" (it is built with the link's extension
    in get_files), so the image is saved as-is — the original appended a
    second ".jpg", producing files named "name.jpg.jpg".
    """
    response = requests.get(base_url + link)
    image = Image.open(BytesIO(response.content))
    image.save(output)
def get_video(link, output):
    """Download a video from the site to *output*.

    Uses a streamed request so the whole video is never buffered in memory
    (the original read the full response body first), and writes in 64 KiB
    chunks instead of 255-byte chunks, avoiding thousands of tiny writes.
    """
    response = requests.get(base_url + link, stream=True)
    with open(output, 'wb') as file:
        for chunk in response.iter_content(chunk_size=64 * 1024):
            # iter_content can yield empty keep-alive chunks; skip them.
            if chunk:
                file.write(chunk)
def get_sound(link, output):
    """Fetch an audio file from the site and write its bytes to *output*."""
    audio_bytes = requests.get(base_url + link).content
    with open(output, 'wb') as out_file:
        out_file.write(audio_bytes)
# ASCII-art banner shown above the prompts and the progress bar.
downloadText = """
____ _____ _ _ _ _____ __ _____ _____ ____ _____ _____
| \| | | | | | | | | | _ | \| __| __ |
| | | | | | | | | | | |__| | | | | | __| -|
|____/|_____|_____|_|___|_____|_____|__|__|____/|_____|__|__|
"""
# NOTE(review): 'cls' is a Windows console command; this script assumes Windows.
os.system('cls')
print(downloadText)
# Interactive configuration: the session cookie (used by answer.get_answer_link
# to fetch answer overlays), the inclusive page range, and the magazine id.
COOKIE = input("What is your cookie? ")
BEGIN_NUMBER = int(input("What is the begin number of the page? "))
END_NUMBER = int(input("What is the end number of the page? "))
MAGAZINE_NUMBER = int(input('What is the magazine number? '))
WEBSITE_URL = "https://oauth.digiboek.be"
# Create (best-effort; ignore "already exists") and enter a working directory
# named after the magazine number.
try:
    os.mkdir(os.path.join(os.getcwd(), str(MAGAZINE_NUMBER)))
except OSError as error:
    pass
print("Current working directory: {0}".format(os.getcwd()))
# Change the current working directory
os.chdir(os.path.join(os.getcwd(), str(MAGAZINE_NUMBER)))
# Print the current working directory
print("Current working directory: {0}".format(os.getcwd()))
payload = ""
session = requests.Session()
# Fetch the first spread's metadata to learn how many hires images a spread
# holds (1 or 2); that determines the page-number step size for the loop below.
res = session.get(
    WEBSITE_URL + f"/userfiles/static/output/html5/issues/{MAGAZINE_NUMBER}/{BEGIN_NUMBER}/spread.js")
data = res.text
jsondata = json.loads(data)
# print(jsondata)
# currentSpread = jsondata['structure']['spread']['spread_id']
amountPages = len(jsondata['structure']['spread']['bg']['hires'])
stepSize = (1 + amountPages % 2)
print(f"Stepsize: {stepSize}")
basePath = os.getcwd()
imgPath = os.path.join(basePath, "images")
start = datetime.datetime.now()
os.system('cls')
print(downloadText)
bar = IncrementalBar('Pages downloaded', max=(
    (END_NUMBER - BEGIN_NUMBER)*(1 + stepSize % 2) + 1), suffix='%(percent)d%% - %(eta)ds')
# Pages are collected as PIL RGB images and written out as one PDF at the end.
image_list = []
for currentPage in range(BEGIN_NUMBER, END_NUMBER + 1, stepSize):
    currentPageNumber = (currentPage - BEGIN_NUMBER) * (1 + stepSize % 2)
    # get_files(MAGAZINE_NUMBER, currentPage,
    #           f"{currentPageNumber} - {currentPageNumber + 1}")
    # print(f"{currentPageNumber} - {currentPageNumber + 1}")
    res = session.get(
        WEBSITE_URL + f"/userfiles/static/output/html5/issues/{MAGAZINE_NUMBER}/{currentPage}/spread.js")
    data = res.text
    jsondata = json.loads(data)
    # print(jsondata)
    # currentSpread = jsondata['structure']['spread']['spread_id']
    amountPages = len(jsondata['structure']['spread']['bg']['hires'])
    if amountPages == 1:
        # One hires image covers the whole spread: paste the answer overlay
        # on it, then split it into a left and a right page.
        fullPage = jsondata['structure']['spread']['bg']['hires'][0]
        imgDataFull = requests.get(WEBSITE_URL + fullPage).content
        im = Image.open(BytesIO(imgDataFull))
        answer_link = answer.get_answer_link(
            COOKIE, MAGAZINE_NUMBER, currentPage)
        im_answ = Image.open(
            BytesIO(session.get(WEBSITE_URL + answer_link).content))
        # Resize the overlay to the spread size; the overlay image itself is
        # used as the paste mask (transparent regions stay untouched).
        edited_im = ImageOps.fit(im_answ, im.size)
        im.paste(edited_im, edited_im)
        imgwidth, imgheight = im.size
        imageLeft = im.crop((0, 0, int(imgwidth / 2), imgheight))
        imageRight = im.crop((int(imgwidth / 2), 0, imgwidth, imgheight))
        # The very first spread contributes only its right half — presumably
        # the cover layout; TODO confirm against the site's page numbering.
        if currentPageNumber != 0:
            im2 = imageLeft.convert('RGB')
            image_list.append(im2)
        im2 = imageRight.convert('RGB')
        image_list.append(im2)
        del im
    if amountPages == 2:
        # Two separate hires images: split the answer overlay in half and
        # paste each half onto its own page image.
        answer_link = answer.get_answer_link(
            COOKIE, MAGAZINE_NUMBER, currentPage)
        im_answ = Image.open(
            BytesIO(session.get(WEBSITE_URL + answer_link).content))
        imgwidth, imgheight = im_answ.size
        im_answ_r = im_answ.crop((int(imgwidth / 2), 0, imgwidth, imgheight))
        im_answ_l = im_answ.crop((0, 0, int(imgwidth / 2), imgheight))
        leftPage = jsondata['structure']['spread']['bg']['hires'][0]
        rightPage = jsondata['structure']['spread']['bg']['hires'][1]
        # Left page of the first spread is skipped (same first-spread rule
        # as above); the right page is always downloaded.
        if currentPageNumber != 0:
            imgDataLeft = requests.get(WEBSITE_URL + leftPage).content
        imgDataRight = requests.get(WEBSITE_URL + rightPage).content
        if currentPageNumber != 0:
            imageLeft = Image.open(BytesIO(imgDataLeft))
            edited_im = ImageOps.fit(im_answ_l, imageLeft.size)
            # edited_im.save(f"{currentPage}.png")
            imageLeft.paste(edited_im, edited_im)
            # imageLeft.save(f"{currentPage}.png")
            im = imageLeft.convert('RGB')
            image_list.append(im)
        imageRight = Image.open(BytesIO(imgDataRight))
        edited_im = ImageOps.fit(im_answ_r, imageRight.size)
        imageRight.paste(edited_im, edited_im)
        im = imageRight.convert('RGB')
        image_list.append(im)
    bar.next(2)
# Assemble all collected pages into a single PDF named after the magazine
# number: the first image is the base, the rest are appended pages.
firstImg = image_list[0]
image_list.pop(0)
firstImg.save(basePath + "\\" + f"{MAGAZINE_NUMBER}.pdf",
              save_all=True, append_images=image_list)
bar.finish()
print(f"Time elapsed: {datetime.datetime.now() - start}")
print("Press Any Key To Exit")
k = readchar.readchar()
Editor is loading...