Benja

asd
mail@pastecode.io avatar
unknown
python
3 years ago
8.6 kB
3
Indexable
Never
__author__ = 'Benjamin Martinez Garate'

from urllib.request import AbstractDigestAuthHandler, urlopen
from bs4 import BeautifulSoup
import requests
import re
import time
import pandas as pd
from pandas import ExcelWriter

# NOTE(review): the compiled pattern is not bound to a name, so nothing in the
# visible code can use it -- confirm whether it is still needed.
re.compile('<title>(.*)</title>')


def theClinicScraper():
    """Print headline, author line and first link of each home-page news card.

    Fetches https://www.theclinic.cl/, selects every ``div`` with class
    ``data`` and, for each one, prints the text of its ``div.title``, the text
    of its first ``<p>``, its first ``<a>`` tag and the card's 1-based position.
    Cards that lack a title or a ``<p>`` element are skipped.  When the server
    does not answer with HTTP 200 only the status code is printed.

    Returns:
        None. All output goes to stdout.
    """
    URL = "https://www.theclinic.cl/"

    # Request the page and keep the status so the failure path can report it.
    req = requests.get(URL)
    status_code = req.status_code

    if status_code != 200:
        # Interpolate with %: passing the code as a second print() argument
        # would emit the literal text "%d" followed by the number.
        print("Status code %d" % status_code)
        return

    html = BeautifulSoup(req.text, "html.parser")

    # Every news card on the page is a div with class "data".
    noticias = html.find_all('div', {'class': 'data'})

    for i, noticia in enumerate(noticias, start=1):
        titulo_tag = noticia.find('div', {'class': 'title'})
        autor_tag = noticia.find('p')
        if titulo_tag is None or autor_tag is None:
            # find() returns None when nothing matches; skip incomplete cards
            # instead of failing with AttributeError on .getText().
            continue

        # getText() yields the plain text without the surrounding HTML.
        print(titulo_tag.getText())
        print(autor_tag.getText())
        print("Url: ", noticia.find('a'))
        print('Numero de iteracion: ', i)
        print('----------------------------------------------------')

def _extract_article(noticia):
    """Build the document for one article container, or None if a field is missing."""
    selectors = (
        ('Titulo', 'h1', {}),
        ('Autor', 'p', {'class': 'author'}),
        ('Fecha', 'p', {'class': 'date'}),
        ('Resumen', 'div', {'class': 'excerpt'}),
        ('Parrafo', 'div', {'class': 'contentPost'}),
    )
    documento = {}
    for key, tag_name, attrs in selectors:
        tag = noticia.find(tag_name, attrs)
        if tag is None:
            # find() returns None when nothing matches.
            return None
        documento[key] = tag.getText()
    return documento


def theClinicSeleccionNacional(coleccion_noticias):
    """Scrape the "seleccion-nacional" listing and store each article once.

    For every ``div.item`` on the first listing page the function follows the
    item's first link, parses the article page and, for each
    ``div`` whose class is ``contents parent``, prints the title, author,
    date, excerpt and body text and inserts them as a single document with the
    keys ``Titulo``, ``Autor``, ``Fecha``, ``Resumen`` and ``Parrafo``.

    Listing items without a link, article pages that do not answer with HTTP
    200 and article containers missing any of the five fields are skipped.

    Args:
        coleccion_noticias: Object exposing ``insert_one(document)``; each
            scraped article is passed to it exactly once.

    Returns:
        None.
    """
    # TODO: only the first listing page is scraped. Further pages live at
    # URL + "page/<n>/", and the site exposes a "pagination" class from which
    # the next-page URL could be read to iterate dynamically.
    URL = "https://www.theclinic.cl/categoria/seleccion-nacional/"
    req = requests.get(URL)
    status_code = req.status_code

    if status_code != 200:
        # Interpolate with %: passing the code as a second print() argument
        # would emit the literal text "%d" followed by the number.
        print("Status code %d" % status_code)
        return

    html = BeautifulSoup(req.text, "html.parser")
    bodyNews = html.find_all('div', {'class': 'item'})

    for bodyNew in bodyNews:
        link_tag = bodyNew.find('a')
        if link_tag is None or not link_tag.get('href'):
            # Nothing to follow for this listing entry.
            continue
        enlace = link_tag['href']

        req_url = requests.get(enlace)
        if req_url.status_code != 200:
            print("Status code %d" % req_url.status_code)
            continue

        soup = BeautifulSoup(req_url.text, "html.parser")
        body = soup.find_all('div', {'class': 'contents parent'})

        for noticia in body:
            nueva_noticia = _extract_article(noticia)
            if nueva_noticia is None:
                continue

            print('\n')
            print(" . . . ")
            print('\n')
            # Print before inserting so only the five scraped fields are
            # shown, in the order title, author, date, excerpt, body.
            for valor in nueva_noticia.values():
                print(valor)

            # Insert just this article. Re-inserting the whole accumulated
            # list on every iteration would store earlier articles again.
            coleccion_noticias.insert_one(nueva_noticia)

        print('----------------------------------------------------')


def emolNacional():
    """Print headlines found on Emol's "nacional" section page.

    Two groups of elements are read from https://www.emol.com/nacional/:

    * ``div.cont_378_e_2015`` -- prints the ``<h1>`` text, the first ``<p>``
      text and the first ``<a>`` tag of each.
    * ``div`` with class ``col_center_noticia4dest-360px bor_destacado`` --
      prints the ``<h3>`` text and the first ``<span>`` text of each.

    Elements missing one of the required tags are skipped.  When the server
    does not answer with HTTP 200 only the status code is printed.

    Returns:
        None. All output goes to stdout.
    """
    # NOTE: the original author marked this function as incomplete.
    URL = "https://www.emol.com/nacional/"
    req = requests.get(URL)
    status_code = req.status_code

    if status_code != 200:
        # Interpolate with %: passing the code as a second print() argument
        # would emit the literal text "%d" followed by the number.
        print("Status code %d" % status_code)
        return

    html = BeautifulSoup(req.text, "html.parser")

    bodyMiddle = html.find_all('div', {'class': 'cont_378_e_2015'})
    for mid in bodyMiddle:
        title_tag = mid.find('h1')
        paragraph_tag = mid.find('p')
        if title_tag is None or paragraph_tag is None:
            # find() returns None when nothing matches; skip incomplete items.
            continue

        print(title_tag.getText())
        print(paragraph_tag.getText())
        print(mid.find('a'))
        print("---------------------")

    otherNews = html.find_all('div', {'class': 'col_center_noticia4dest-360px bor_destacado'})
    for newsCenter in otherNews:
        titulo_tag = newsCenter.find('h3')
        hora_tag = newsCenter.find('span')
        if titulo_tag is None or hora_tag is None:
            continue

        print(titulo_tag.getText())
        print(hora_tag.getText())
        print("*******************")


def laTerceraNacional():
    """Print the headline and first link of each top section on La Tercera.

    Fetches https://www.latercera.com/canal/nacional/, selects every
    ``section`` with class ``top-mainy`` and prints the text of its ``<h3>``
    followed by its first ``<a>`` tag.  Sections without an ``<h3>`` are
    skipped.  When the server does not answer with HTTP 200 only the status
    code is printed.

    Returns:
        None. All output goes to stdout.
    """
    URL = "https://www.latercera.com/canal/nacional/"
    req = requests.get(URL)
    status_code = req.status_code

    if status_code != 200:
        # Interpolate with %: passing the code as a second print() argument
        # would emit the literal text "%d" followed by the number.
        print("Status code %d" % status_code)
        return

    soup = BeautifulSoup(req.text, "html.parser")
    body1 = soup.find_all('section', {'class': 'top-mainy'})

    for bodies1 in body1:
        titulo_tag = bodies1.find('h3')
        if titulo_tag is None:
            # find() returns None when nothing matches; skip such sections.
            continue

        print(titulo_tag.getText())
        print(bodies1.find('a'))
        print("------------------------")

def get_database():
    """Return a handle to the ``titulo`` database on the local MongoDB server.

    A new ``MongoClient`` is created on every call using a connection string
    that points at ``localhost:27017``.

    Returns:
        The object obtained from ``client['titulo']``.
    """
    # Imported here, as in the original, so the module can be loaded without
    # pymongo unless this function is actually called.
    from pymongo import MongoClient

    # NOTE(review): the path component of the URI is "Titulo" while the handle
    # returned below is for "titulo" -- confirm which spelling is intended.
    CONNECTION_STRING = "mongodb://localhost:27017/Titulo?readPreference=primary&appname=MongoDB%20Compass&ssl=false"

    client = MongoClient(CONNECTION_STRING)

    return client['titulo']


# Script entry: call the scraper functions.
# These statements run at import time: they obtain the database handle, pick
# the "benja" collection and pass it to the one scraper that is enabled.
db = get_database()
coleccion_noticias = db["benja"]


# TheClinic (the home-page scraper is disabled; only the section scraper runs)
# theClinicScraper()
theClinicSeleccionNacional(coleccion_noticias)

# LUN (no scraper in the visible code)


# La Cuarta (no scraper in the visible code)


# La Tercera (disabled)
# laTerceraNacional()

# Emol (disabled)
# emolNacional()