web-scraper
unknown
python
3 years ago
7.0 kB
10
Indexable
"""Digiboek magazine downloader.

Scrapes page spreads from oauth.digiboek.be, pastes the answer overlay
(resolved via the project-local ``answer`` module) on top of each page,
and bundles all pages into ``<magazine_number>.pdf``.  Attachments
referenced by a spread (photos, videos, sounds) can be fetched with
``get_files()``.

NOTE(review): the script is Windows-oriented (``os.system('cls')``,
console-only); run it from a Windows terminal or adapt the clear-screen
calls.
"""
import datetime
import json
import os
from io import BytesIO

import requests
from PIL import Image, ImageOps
from progress.bar import IncrementalBar
import readchar

import answer  # project-local helper that resolves answer-overlay URLs

base_url = "https://oauth.digiboek.be"


def get_files(book_number, page_number, pages):
    """Download the photo/video/sound attachments of one spread.

    book_number -- magazine id on the site
    page_number -- spread index within the magazine
    pages       -- name of the directory the attachments are saved into
                   (created on demand)
    """
    url = base_url + \
        f"/userfiles/static/output/html5/issues/{book_number}/{page_number}/spread.js"
    response = requests.get(url=url)
    data = response.json()
    for child in data['structure']['children']:
        try:
            for item in child['link']:
                if item in ["photoviewer", "video", "sound"]:
                    # Create the target directory lazily; exist_ok replaces
                    # the old try/mkdir/except-OSError-pass pattern.
                    os.makedirs(os.path.join(os.getcwd(), pages), exist_ok=True)
                    link = child['link'][0][0]['src']
                    name = child['link'][2].strip()
                    # Drop characters that are illegal in Windows file names.
                    for forbidden in "?\\/:*\"<>|,":
                        name = name.replace(forbidden, "")
                    extension = link[link.rfind("."):]
                    output = os.path.join(pages, name + extension)
                    if extension == ".jpg":
                        get_photo(link, output)
                    elif extension == ".mp4":
                        get_video(link, output)
                    elif extension == ".mp3":
                        get_sound(link, output)
        except KeyError:
            # Child has no (usable) 'link' entry -- nothing to download.
            pass


def get_photo(link, output):
    """Fetch an image attachment and save it to *output*."""
    response = requests.get(base_url + link)
    image = Image.open(BytesIO(response.content))
    # BUGFIX: *output* already ends in ".jpg" (get_files appends the
    # extension); the old code added a second ".jpg", writing
    # "name.jpg.jpg" files.
    image.save(output)


def get_video(link, output):
    """Stream a video attachment to *output* without buffering it in RAM."""
    # stream=True so iter_content() actually streams; without it the whole
    # body is downloaded up front and the chunk loop is pointless.  Chunk
    # size raised from 255 bytes to a sane 8 KiB.
    response = requests.get(base_url + link, stream=True)
    with open(output, 'wb') as file:
        for chunk in response.iter_content(chunk_size=8192):
            if chunk:
                file.write(chunk)


def get_sound(link, output):
    """Fetch a sound attachment and save it to *output*."""
    response = requests.get(base_url + link)
    with open(output, 'wb') as file:
        file.write(response.content)


# "DOWNLOADER" banner (figlet "rectangles" style).
downloadText = """
 ____  _____ _ _ _ _____ __    _____ _____ ____  _____ _____
|    \\|     | | | |   | |  |  |     |  _  |    \\|   __| __  |
|  |  |  |  | | | | | | |  |__|  |  |     |  |  |   __|    -|
|____/|_____|_____|_|___|_____|_____|__|__|____/|_____|__|__|
"""

os.system('cls')
print(downloadText)

COOKIE = input("What is your cookie? ")
BEGIN_NUMBER = int(input("What is the begin number of the page? "))
END_NUMBER = int(input("What is the end number of the page? "))
MAGAZINE_NUMBER = int(input('What is the magazine number? \n'))

WEBSITE_URL = "https://oauth.digiboek.be"

# All output goes into a per-magazine subdirectory of the start directory.
os.makedirs(os.path.join(os.getcwd(), str(MAGAZINE_NUMBER)), exist_ok=True)
print("Current working directory: {0}".format(os.getcwd()))
# Change the current working directory
os.chdir(os.path.join(os.getcwd(), str(MAGAZINE_NUMBER)))
print("Current working directory: {0}".format(os.getcwd()))

session = requests.Session()

# Probe the first spread to learn whether spreads hold one or two hi-res
# background images; that decides the page-number step of the main loop.
res = session.get(
    WEBSITE_URL
    + f"/userfiles/static/output/html5/issues/{MAGAZINE_NUMBER}/{BEGIN_NUMBER}/spread.js")
jsondata = json.loads(res.text)
amountPages = len(jsondata['structure']['spread']['bg']['hires'])
stepSize = (1 + amountPages % 2)
print(f"Stepsize: {stepSize}")

basePath = os.getcwd()
start = datetime.datetime.now()

os.system('cls')
print(downloadText)
bar = IncrementalBar(
    'Pages downloaded',
    max=((END_NUMBER - BEGIN_NUMBER) * (1 + stepSize % 2) + 1),
    suffix='%(percent)d%% - %(eta)ds')

image_list = []
for currentPage in range(BEGIN_NUMBER, END_NUMBER + 1, stepSize):
    currentPageNumber = (currentPage - BEGIN_NUMBER) * (1 + stepSize % 2)
    res = session.get(
        WEBSITE_URL
        + f"/userfiles/static/output/html5/issues/{MAGAZINE_NUMBER}/{currentPage}/spread.js")
    jsondata = json.loads(res.text)
    amountPages = len(jsondata['structure']['spread']['bg']['hires'])

    if amountPages == 1:
        # One hi-res image covers the whole spread: paste the answer
        # overlay on it, then split it into a left and a right page.
        fullPage = jsondata['structure']['spread']['bg']['hires'][0]
        imgDataFull = requests.get(WEBSITE_URL + fullPage).content
        im = Image.open(BytesIO(imgDataFull))
        answer_link = answer.get_answer_link(
            COOKIE, MAGAZINE_NUMBER, currentPage)
        im_answ = Image.open(
            BytesIO(session.get(WEBSITE_URL + answer_link).content))
        edited_im = ImageOps.fit(im_answ, im.size)
        # PIL idiom: the second argument doubles as the paste mask, so
        # only the overlay's opaque pixels land on the page.
        im.paste(edited_im, edited_im)
        imgwidth, imgheight = im.size
        imageLeft = im.crop((0, 0, int(imgwidth / 2), imgheight))
        imageRight = im.crop((int(imgwidth / 2), 0, imgwidth, imgheight))
        if currentPageNumber != 0:
            # The very first spread contributes only its right half
            # (cover layout) -- the left half is skipped.
            image_list.append(imageLeft.convert('RGB'))
        image_list.append(imageRight.convert('RGB'))
        del im

    if amountPages == 2:
        # Two separate hi-res images; the answer overlay spans the whole
        # spread, so split it and paste each half on its own page.
        answer_link = answer.get_answer_link(
            COOKIE, MAGAZINE_NUMBER, currentPage)
        im_answ = Image.open(
            BytesIO(session.get(WEBSITE_URL + answer_link).content))
        imgwidth, imgheight = im_answ.size
        im_answ_r = im_answ.crop((int(imgwidth / 2), 0, imgwidth, imgheight))
        im_answ_l = im_answ.crop((0, 0, int(imgwidth / 2), imgheight))

        leftPage = jsondata['structure']['spread']['bg']['hires'][0]
        rightPage = jsondata['structure']['spread']['bg']['hires'][1]
        if currentPageNumber != 0:
            # First spread: left page skipped (cover layout).
            imgDataLeft = requests.get(WEBSITE_URL + leftPage).content
        imgDataRight = requests.get(WEBSITE_URL + rightPage).content

        if currentPageNumber != 0:
            imageLeft = Image.open(BytesIO(imgDataLeft))
            edited_im = ImageOps.fit(im_answ_l, imageLeft.size)
            imageLeft.paste(edited_im, edited_im)
            image_list.append(imageLeft.convert('RGB'))

        imageRight = Image.open(BytesIO(imgDataRight))
        edited_im = ImageOps.fit(im_answ_r, imageRight.size)
        imageRight.paste(edited_im, edited_im)
        image_list.append(imageRight.convert('RGB'))

    # NOTE(review): progress advances two page-numbers per spread;
    # original indentation was ambiguous -- confirm placement.
    bar.next(2)

# The first collected image becomes the PDF's first page; the remaining
# pages are appended behind it.
firstImg = image_list[0]
image_list.pop(0)
firstImg.save(os.path.join(basePath, f"{MAGAZINE_NUMBER}.pdf"),
              save_all=True, append_images=image_list)
bar.finish()

print(f"Time elapsed: {datetime.datetime.now() - start}")
print("Press Any Key To Exit")
k = readchar.readchar()
Editor is loading...