Untitled

mail@pastecode.io avatar
unknown
python
2 years ago
2.7 kB
2
Indexable
Never
from robobrowser import RoboBrowser
import re
import os

from datetime import datetime

baseurl = 'http://www.ituverava.sp.gov.br/arquivo/'
url = 'http://www.ituverava.sp.gov.br/arquivo/?id_secao=10&nome_categoria=boletins-informativos&categoria=104&assunto='

# Directory where bulletins are stored, one file per day named YYYY-MM-DD.<ext>.
OUTPUT_DIR = 'boletins'
# Stop paging once this many links in a row were older than the newest saved
# bulletin or already present on disk.
MAX_PASTS = 5
# Bulletin numbers up to this value are dated 2020, higher ones 2021.
# NOTE(review): the year embedded in the file name is ignored in favour of
# this cut-off, so bulletins from 2022 onwards would be misdated -- confirm.
LAST_2020_NUMBER = 286

# Accepted bulletin file names (number, day, month, optional year, optional
# one-digit suffix after the year):
#   boletim-NNN-DD-MM.ext
#   boletim-NNN-DD-MM-YYYY.ext
#   boletim-NNN-DD-MM-YYYY-D.ext
REPORT_RE = re.compile(
    r'boletim-(\d{3})-(\d{2})-(\d{2})(?:-\d{4}(?:-\d)?)?\.(jpeg|jpg|png)'
)


def parse_report_name(name):
    """Parse a bulletin file name from the site.

    Args:
        name: last path component of a download link.

    Returns:
        A ``(date, local_name)`` tuple, where ``date`` is a ``datetime`` at
        midnight and ``local_name`` is ``'YYYY-MM-DD.<ext>'``, or ``None``
        when ``name`` is not a bulletin file name or encodes an impossible
        calendar date.
    """
    match = REPORT_RE.fullmatch(name)
    if match is None:
        return None
    number, day, month, ext = match.groups()
    year = '2021' if int(number) > LAST_2020_NUMBER else '2020'
    stamp = f'{year}-{month}-{day}'
    try:
        date = datetime.strptime(stamp, '%Y-%m-%d')
    except ValueError:
        # e.g. day 31 in a 30-day month: treat as "not a bulletin" instead of
        # aborting the whole run.
        return None
    return date, f'{stamp}.{ext}'


def latest_saved_date(filenames):
    """Return the newest date encoded in ``filenames``.

    Only names whose part before the first dot parses as ``YYYY-MM-DD`` are
    considered; anything else is ignored. The result does not depend on the
    order of ``filenames`` (``os.listdir`` gives no ordering guarantee).

    Returns:
        The latest ``datetime`` found, or ``None`` when no name qualifies.
    """
    latest = None
    for filename in filenames:
        try:
            date = datetime.strptime(filename.split('.')[0], '%Y-%m-%d')
        except ValueError:
            continue
        if latest is None or date > latest:
            latest = date
    return latest


def main():
    """Download bulletins newer than the ones already in ``OUTPUT_DIR``.

    Walks the paginated listing at ``url`` and saves every bulletin image that
    is not older than the newest saved one and not yet on disk. Stops after
    ``MAX_PASTS`` consecutive known/older links (checked at page boundaries)
    or when there is no further page.

    Raises:
        requests.HTTPError: if a bulletin download returns an error status.
    """
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    # Snapshot the directory once; the set is kept in sync after each download.
    existing = set(os.listdir(OUTPUT_DIR))
    last_date = latest_saved_date(existing)
    if last_date is None:
        # Nothing saved yet: every bulletin counts as new.
        last_date = datetime.min

    browser = RoboBrowser(parser='html.parser')
    browser.open(url)

    page = 1
    pasts = 0
    while pasts < MAX_PASTS:
        print(f'Página {page}')
        table = browser.find(class_='table table-hover')
        if table is None:
            # Page layout is not what we expect; nothing to scan.
            break

        for anchor in table.find_all('a'):
            link = anchor.get('href')
            if not link:
                continue
            name = link.split('/')[-1]
            report = parse_report_name(name)
            if report is not None:
                date, name = report
                if date < last_date:
                    pasts += 1
                    continue
            if name in existing:
                pasts += 1
                continue
            # NOTE(review): links that are not bulletins also reset the
            # counter here, as the original flow did -- confirm intended.
            pasts = 0
            if report is None:
                continue
            response = browser.session.get(link)
            # Do not store an error page under a bulletin's name: it would
            # block the real file from ever being fetched on later runs.
            response.raise_for_status()
            with open(os.path.join(OUTPUT_DIR, name), 'wb') as f:
                f.write(response.content)
            existing.add(name)
            print('-' + name)

        # Resolve the "next page" link once per page, whether or not anything
        # was downloaded from it.
        page_links = browser.find_all(class_='page-link')
        if not page_links:
            break
        next_pg = page_links[-1]
        if next_pg.get('aria-label') != 'Next' or not next_pg.get('href'):
            break
        browser.open(baseurl + next_pg['href'])
        page += 1


if __name__ == '__main__':
    main()