__author__ = 'Benjamin Martinez Garate'

from bs4 import BeautifulSoup
import requests


def theClinicScraper():
    # TheClinic home page URL
    URL = "https://www.theclinic.cl/"
    # Request the page
    req = requests.get(URL)
    # Check that the server response is correct
    status_code = req.status_code
    if status_code == 200:
        # Parse the HTML content
        html = BeautifulSoup(req.text, "html.parser")
        # Get every news div with class 'data'
        noticias = html.find_all('div', {'class': 'data'})
        # Walk the news items and extract the title and paragraph of each
        for i, noticia in enumerate(noticias):
            # getText() returns plain text rather than HTML
            titulo = noticia.find('div', {'class': 'title'}).getText()
            autor = noticia.find('p').getText()
            # find('a') returns the whole tag; take its href for the URL
            links = noticia.find('a')['href']
            # Print title, text and the iteration number to count the news items
            print(titulo)
            print(autor)
            print("Url: ", links)
            print('Numero de iteracion: ', i + 1)
            print('----------------------------------------------------')
    else:
        print("Status code %d" % status_code)


def theClinicSeleccionNacional(coleccion_noticias):
    # NOTE: there is a class called 'pagination' from which the URL can be
    # extracted to iterate through pages more dynamically.
    URL = "https://www.theclinic.cl/categoria/seleccion-nacional/"
    req = requests.get(URL)
    status_code = req.status_code
    list_aut = []
    list_date = []
    list_tittle = []
    list_resume = []
    list_p = []
    if status_code == 200:
        html = BeautifulSoup(req.text, "html.parser")
        bodyNews = html.find_all('div', {'class': 'item'})
        for bodyNew in bodyNews:
            title = bodyNew.find('div', {'class': 'title'}).getText()
            autorDate = bodyNew.find('p').getText()
            enlace = bodyNew.find('a')['href']
            # Follow each article link and scrape the full story
            req_url = requests.get(enlace)
            soup = BeautifulSoup(req_url.text, "html.parser")
            body = soup.find_all('div', {'class': 'contents parent'})
            for noticia in body:
                h1 = noticia.find('h1').getText()
                author = noticia.find('p', {'class': 'author'}).getText()
                date = noticia.find('p', {'class': 'date'}).getText()
                resume = noticia.find('div', {'class': 'excerpt'}).getText()
                paragraph = noticia.find('div', {'class': 'contentPost'}).getText()
                print('\n')
                print(" . . . ")
                print('\n')
                print(h1)
                list_tittle.append(h1)
                print(author)
                list_aut.append(author)
                print(date)
                list_date.append(date)
                print(resume)
                list_resume.append(resume)
                print(paragraph)
                list_p.append(paragraph)
        # Store each scraped article as a document in MongoDB
        for index, element in enumerate(list_tittle):
            nueva_noticia = {
                'Titulo': list_tittle[index],
                'Autor': list_aut[index],
                'Fecha': list_date[index],
                'Resumen': list_resume[index],
                'Parrafo': list_p[index],
            }
            coleccion_noticias.insert_one(nueva_noticia)
            print('----------------------------------------------------')
        '''
        # Pages 2 to 5 of the same category, kept commented out:
        for page in range(2, 6):
            url_page = "https://www.theclinic.cl/categoria/seleccion-nacional/page/%d/" % page
            req_page = requests.get(url_page)
            html_page = BeautifulSoup(req_page.text, "html.parser")
            items = html_page.find_all('div', {'class': 'item'})
            for item in items:
                print(item.find('div', {'class': 'title'}).getText())
                print(item.find('p').getText())
                print('----------------------------------------------------')
        '''
    else:
        print("Status code %d" % status_code)


def emolNacional():
    # This function is still incomplete
    URL = "https://www.emol.com/nacional/"
    req = requests.get(URL)
    status_code = req.status_code
    if status_code == 200:
        html = BeautifulSoup(req.text, "html.parser")
        bodyMiddle = html.find_all('div', {'class': 'cont_378_e_2015'})
        for i, mid in enumerate(bodyMiddle):
            title = mid.find('h1').getText()
            paragraph = mid.find('p').getText()
            link = mid.find('a')
            print(title)
            print(paragraph)
            print(link)
            print("---------------------")
        otherNews = html.find_all('div', {'class': 'col_center_noticia4dest-360px bor_destacado'})
        for j, newsCenter in enumerate(otherNews):
            titulo = newsCenter.find('h3').getText()
            hora = newsCenter.find('span').getText()
            print(titulo)
            print(hora)
            print("*******************")
    else:
        print("Status code %d" % status_code)


def laTerceraNacional():
    URL = "https://www.latercera.com/canal/nacional/"
    req = requests.get(URL)
    status_code = req.status_code
    if status_code == 200:
        soup = BeautifulSoup(req.text, "html.parser")
        body1 = soup.find_all('section', {'class': 'top-mainy'})
        for i, bodies1 in enumerate(body1):
            titulo = bodies1.find('h3').getText()
            a = bodies1.find('a')
            print(titulo)
            print(a)
            print("------------------------")
    else:
        print("Status code %d" % status_code)

def get_database():
    from pymongo import MongoClient

    # MongoDB Atlas-style connection string pointing at a local instance
    CONNECTION_STRING = "mongodb://localhost:27017/Titulo?readPreference=primary&appname=MongoDB%20Compass&ssl=false"

    # Create a connection using MongoClient
    client = MongoClient(CONNECTION_STRING)

    # Return the database used throughout this script
    return client['titulo']


# Calling the functions
db = get_database()
coleccion_noticias = db["benja"]

# TheClinic
# theClinicScraper()
theClinicSeleccionNacional(coleccion_noticias)

# LUN

# La Cuarta

# La Tercera
# laTerceraNacional()

# Emol
# emolNacional()
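

# --- Sketch: Excel export --------------------------------------------------
# The original paste imported pandas and ExcelWriter without using them,
# which suggests an Excel export was planned. A minimal sketch of that idea,
# assuming the MongoDB documents built above; the filename is arbitrary and
# writing .xlsx requires openpyxl to be installed.
def exportar_excel(coleccion, ruta='noticias.xlsx'):
    import pandas as pd

    # Pull every document, dropping Mongo's internal _id field
    docs = list(coleccion.find({}, {'_id': 0}))
    df = pd.DataFrame(docs)
    with pd.ExcelWriter(ruta) as writer:
        df.to_excel(writer, sheet_name='noticias', index=False)

# exportar_excel(coleccion_noticias)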