__author__ = 'Benjamin Martinez Garate'
from bs4 import BeautifulSoup
import requests
import pandas as pd
from pandas import ExcelWriter
def theClinicScraper():
    # The Clinic home page URL
    URL = "https://www.theclinic.cl/"
    # Request the page
    req = requests.get(URL)
    # Check that the server responded successfully
    status_code = req.status_code
    if status_code == 200:
        # Parse the HTML content
        html = BeautifulSoup(req.text, "html.parser")
        # Grab every news <div> with class "data"
        noticias = html.find_all('div', {'class': 'data'})
        # Walk the news items and extract the title and byline from each
        for i, noticia in enumerate(noticias):
            # getText() returns plain text rather than raw HTML
            titulo = noticia.find('div', {'class': 'title'}).getText()
            autor = noticia.find('p').getText()
            link = noticia.find('a')['href']
            # Print title, byline, URL and the iteration number to count the items
            print(titulo)
            print(autor)
            print("Url: ", link)
            print('Iteration number: ', i + 1)
            print('----------------------------------------------------')
    else:
        print("Status code %d" % status_code)
def theClinicSeleccionNacional(coleccion_noticias):
    # NOTE: there is a class called "pagination" from which the URL can be
    # extracted to iterate over the pages more dynamically (a sketch follows
    # this function)
    URL = "https://www.theclinic.cl/categoria/seleccion-nacional/"
    req = requests.get(URL)
    status_code = req.status_code
    list_aut = []
    list_date = []
    list_title = []
    list_resume = []
    list_p = []
    if status_code == 200:
        html = BeautifulSoup(req.text, "html.parser")
        bodyNews = html.find_all('div', {'class': 'item'})
        for bodyNew in bodyNews:
            title = bodyNew.find('div', {'class': 'title'}).getText()
            autorDate = bodyNew.find('p').getText()
            enlace = bodyNew.find('a')['href']
            # Follow the link and scrape the full article page
            req_url = requests.get(enlace)
            soup = BeautifulSoup(req_url.text, "html.parser")
            body = soup.find_all('div', {'class': 'contents parent'})
            for noticia in body:
                h1 = noticia.find('h1').getText()
                author = noticia.find('p', {'class': 'author'}).getText()
                date = noticia.find('p', {'class': 'date'}).getText()
                resume = noticia.find('div', {'class': 'excerpt'}).getText()
                paragraph = noticia.find('div', {'class': 'contentPost'}).getText()
                print('\n')
                print(" . . . ")
                print('\n')
                print(h1)
                list_title.append(h1)
                print(author)
                list_aut.append(author)
                print(date)
                list_date.append(date)
                print(resume)
                list_resume.append(resume)
                print(paragraph)
                list_p.append(paragraph)
        # Store each scraped article as a document in the MongoDB collection
        for index in range(len(list_title)):
            nueva_noticia = {
                'Titulo': list_title[index],
                'Autor': list_aut[index],
                'Fecha': list_date[index],
                'Resumen': list_resume[index],
                'Parrafo': list_p[index],
            }
            coleccion_noticias.insert_one(nueva_noticia)
            print('----------------------------------------------------')
        # Commented-out draft: the same scrape repeated verbatim for pages
        # 2 through 5, condensed here into one loop over the page number.
        '''
        for page in range(2, 6):
            url_page = "https://www.theclinic.cl/categoria/seleccion-nacional/page/%d/" % page
            req_page = requests.get(url_page)
            html_page = BeautifulSoup(req_page.text, "html.parser")
            items = html_page.find_all('div', {'class': 'item'})
            for item in items:
                title_page = item.find('div', {'class': 'title'}).getText()
                paragraph_page = item.find('p').getText()
                print(title_page)
                print(paragraph_page)
                print('----------------------------------------------------')
        '''
    else:
        print("Status code %d" % status_code)
def emolNacional():
    # TODO: this function is still incomplete (one possible missing piece is
    # sketched after this function)
    URL = "https://www.emol.com/nacional/"
    req = requests.get(URL)
    status_code = req.status_code
    if status_code == 200:
        html = BeautifulSoup(req.text, "html.parser")
        bodyMiddle = html.find_all('div', {'class': 'cont_378_e_2015'})
        for mid in bodyMiddle:
            title = mid.find('h1').getText()
            paragraph = mid.find('p').getText()
            link = mid.find('a')['href']
            print(title)
            print(paragraph)
            print(link)
            print("---------------------")
        otherNews = html.find_all('div', {'class': 'col_center_noticia4dest-360px bor_destacado'})
        for newsCenter in otherNews:
            titulo = newsCenter.find('h3').getText()
            hora = newsCenter.find('span').getText()
            print(titulo)
            print(hora)
            print("*******************")
    else:
        print("Status code %d" % status_code)
def laTerceraNacional():
    URL = "https://www.latercera.com/canal/nacional/"
    req = requests.get(URL)
    status_code = req.status_code
    if status_code == 200:
        soup = BeautifulSoup(req.text, "html.parser")
        body1 = soup.find_all('section', {'class': 'top-mainy'})
        for bodies1 in body1:
            titulo = bodies1.find('h3').getText()
            a = bodies1.find('a')['href']
            print(titulo)
            print(a)
            print("------------------------")
    else:
        print("Status code %d" % status_code)
def get_database():
    from pymongo import MongoClient
    # MongoDB connection string for a local instance (MongoDB Compass defaults)
    CONNECTION_STRING = "mongodb://localhost:27017/Titulo?readPreference=primary&appname=MongoDB%20Compass&ssl=false"
    # Create a connection using MongoClient
    client = MongoClient(CONNECTION_STRING)
    # Return the database used throughout this script
    return client['titulo']
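
# pandas and ExcelWriter are imported at the top of the file but never used;
# this sketches the export they presumably enable. The 'noticias.xlsx'
# filename is an assumption, and writing .xlsx requires openpyxl installed.
def export_to_excel(coleccion_noticias):
    # Pull every stored document, dropping Mongo's internal _id field
    docs = list(coleccion_noticias.find({}, {'_id': 0}))
    df = pd.DataFrame(docs)
    with ExcelWriter('noticias.xlsx') as writer:
        df.to_excel(writer, sheet_name='noticias', index=False)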
# Function calls
db = get_database()
coleccion_noticias = db["benja"]
# TheClinic
# theClinicScraper()
theClinicSeleccionNacional(coleccion_noticias)
# LUN
# La Cuarta
# La Tercera
# laTerceraNacional()
# Emol
# emolNacional()