Untitled
unknown
python
2 years ago
2.6 kB
3
Indexable
# Download the bulletin images listed on the Ituverava archive pages that are
# not yet present in the local 'boletins' directory.
#
# Local files are stored as 'YYYY-MM-DD.<ext>'.  The script walks the paginated
# listing, downloads every bulletin that is not older than the newest local
# file and not already saved, and stops once the `pasts` counter of skipped
# links reaches 5 or there is no "Next" page.
#
# Expects `os`, `re`, `datetime.datetime` and `RoboBrowser` to be in scope.

baseurl = 'http://www.ituverava.sp.gov.br/arquivo/'
url = 'http://www.ituverava.sp.gov.br/arquivo/?id_secao=10&nome_categoria=boletins-informativos&categoria=104&assunto='

# Accepted link file names (NNN = bulletin number, DD = day, MM = month):
#   boletim-NNN-DD-MM-YYYY.<ext>
#   boletim-NNN-DD-MM-YYYY-K.<ext>
#   boletim-NNN-DD-MM.<ext>
# Groups: bulletin number, day, month, extension.  The optional year and
# one-digit suffix are matched but deliberately not captured (see below).
bulletin_re = re.compile(
    r'boletim\-(\d{3})\-(\d{2})\-(\d{2})(?:\-\d{4}(?:\-\d{1})?)?\.(jpeg|jpg|png)'
)

browser = RoboBrowser(parser='html.parser')
browser.open(url)

# Read the directory once; the set is kept in sync as files are written.
saved_names = set(os.listdir('boletins'))

if saved_names:
    # os.listdir() returns entries in arbitrary order, so "[-1]" is not the
    # newest file.  Names are ISO dates, hence the lexicographic maximum is
    # the most recent one.
    last_file = max(saved_names).split('.')[0]
    last_date = datetime.strptime(last_file, '%Y-%m-%d')
else:
    # Nothing downloaded yet: no bulletin counts as "older than the last one".
    last_file = None
    last_date = datetime.min

done = False  # not read by the code below; kept for backward compatibility
i = 1         # current page number (only used for the progress message)
pasts = 0     # links skipped since the last new file name was seen

while pasts < 5:
    print(f'Página {i}')
    downs = browser.find(class_='table table-hover').find_all('a')
    for down in downs:
        link = down['href']
        name = link.split('/')[-1]
        is_report = False

        match = bulletin_re.fullmatch(name)
        if match:
            n, d, m, fmt = match.groups()
            # The year embedded in the file name (when present) is ignored:
            # bulletins numbered above 286 are dated 2021, all others 2020.
            y = '2021' if int(n) > 286 else '2020'
            if datetime.strptime(f'{y}-{m}-{d}', '%Y-%m-%d') < last_date:
                # Older than the newest local file: skip it.
                pasts += 1
                continue
            name = f'{y}-{m}-{d}.{fmt}'
            is_report = True

        if name in saved_names:
            pasts += 1
            continue

        # NOTE(review): a link that is not a bulletin also resets the counter
        # here, exactly as in the original logic -- confirm this is intended.
        pasts = 0
        if is_report:
            r = browser.session.get(link)
            with open(f'boletins/{name}', 'wb') as f:
                f.write(r.content)
            saved_names.add(name)
            print('-' + name)

    # Look up the pager once per page, after all links were handled, so the
    # check below never sees an unbound or stale value.
    page_links = browser.find_all(class_='page-link')
    if not page_links:
        break
    next_pg = page_links[-1]
    # Stop unless the last pager entry really is the "Next" link; otherwise
    # the loop would keep re-processing the same page.
    if next_pg.get('aria-label') != 'Next':
        break
    browser.open(baseurl + next_pg['href'])
    i += 1
Editor is loading...