# Page header text captured along with the script (not code):
# Untitled · unknown · python · 3 years ago · 2.6 kB · 4 · Indexable
import os
import re
from datetime import datetime

baseurl = 'http://www.ituverava.sp.gov.br/arquivo/'
url = 'http://www.ituverava.sp.gov.br/arquivo/?id_secao=10&nome_categoria=boletins-informativos&categoria=104&assunto='

# Local directory holding the downloaded bulletins, saved as YYYY-MM-DD.<ext>.
OUTPUT_DIR = 'boletins'

# Bulletin numbers above this value are dated 2021; all others are dated 2020.
LAST_2020_BULLETIN = 286

# The page loop stops once this many links have been skipped (older than the
# newest local file, or already on disk) since the last unseen file name.
# The counter is only checked between pages, never in the middle of one.
MAX_PASTS = 5

# boletim-NNN-DD-MM[-YYYY[-N]].(jpeg|jpg|png)
REPORT_NAME = re.compile(
    r'boletim-(\d{3})-(\d{2})-(\d{2})(?:-\d{4}(?:-\d)?)?\.(jpeg|jpg|png)'
)


def parse_report_name(name):
    """Translate a bulletin file name from the site into its local name.

    The year is derived from the bulletin number (see LAST_2020_BULLETIN);
    a year embedded in the file name is ignored, as is a trailing ``-N``
    suffix.

    Args:
        name: Last path component of a download link.

    Returns:
        A ``(local_name, report_date)`` tuple, where ``local_name`` is
        ``YYYY-MM-DD.<ext>`` and ``report_date`` is the matching ``datetime``,
        or ``None`` when ``name`` is not a bulletin file name.

    Raises:
        ValueError: If the day/month digits do not form a valid date.
    """
    match = REPORT_NAME.fullmatch(name)
    if match is None:
        return None
    number, day, month, ext = match.groups()
    year = '2021' if int(number) > LAST_2020_BULLETIN else '2020'
    report_date = datetime.strptime(f'{year}-{month}-{day}', '%Y-%m-%d')
    return f'{year}-{month}-{day}.{ext}', report_date


def main():
    """Download the bulletins that are not older than the newest local file.

    Walks the paginated listing starting at ``url``, saves every bulletin image
    that is not yet in OUTPUT_DIR, and stops when the pager has no "Next" link
    or when MAX_PASTS links have been skipped.
    """
    # Function-scope import: the helper above stays importable even when the
    # third-party package is not installed.
    from robobrowser import RoboBrowser

    browser = RoboBrowser(parser='html.parser')
    browser.open(url)

    # Read the directory once and keep the set in sync as files are written.
    existing = set(os.listdir(OUTPUT_DIR))
    if existing:
        # os.listdir() returns names in arbitrary order, so pick the newest
        # explicitly; YYYY-MM-DD names sort chronologically.
        last_date = datetime.strptime(max(existing).split('.')[0], '%Y-%m-%d')
    else:
        # Nothing downloaded yet: treat every bulletin as new.
        last_date = datetime.min

    i = 1
    pasts = 0
    while pasts < MAX_PASTS:
        print(f'Página {i}')
        downs = browser.find(class_='table table-hover').find_all('a')
        for down in downs:
            link = down['href']
            name = link.split('/')[-1]
            parsed = parse_report_name(name)
            if parsed is not None:
                name, report_date = parsed
                if report_date < last_date:
                    pasts += 1
                    continue
            if name in existing:
                pasts += 1
                continue
            # NOTE(review): any unseen name resets the counter, including links
            # that are not bulletins and are never saved - confirm intended.
            pasts = 0
            if parsed is not None:
                # NOTE(review): a "-N" suffixed file maps to the same local
                # name as the unsuffixed one with the same date and extension,
                # so only the first one encountered is saved.
                r = browser.session.get(link)
                with open(f'{OUTPUT_DIR}/{name}', 'wb') as f:
                    f.write(r.content)
                existing.add(name)
                print('-' + name)

        # Look the pager up once per page, after every link has been handled,
        # so it always refers to the page currently loaded.
        page_links = browser.find_all(class_='page-link')
        if not page_links or page_links[-1].get('aria-label') != 'Next':
            break
        browser.open(baseurl + page_links[-1]['href'])
        i += 1


if __name__ == '__main__':
    main()