Untitled

mail@pastecode.io avatar
unknown
python
2 years ago
3.6 kB
5
Indexable
Never
import pandas as pd
import requests
from bs4 import BeautifulSoup
from typing import List
import csv
from dataclasses import dataclass, asdict

@dataclass
class Car:
    """A single car offer scraped from an otomoto.pl search-results page.

    BUG FIX: the original class carried bare annotations with no @dataclass
    decorator, so ``Car(link=..., ...)`` keyword construction (used in
    ``extract_cars_from_page``) and ``asdict(car)`` (used in ``write_to_csv``)
    both raised at runtime.

    All fields hold the raw text exactly as scraped from the page.
    """
    link: str
    full_name: str
    year: str
    mileage: str
    engine_capacity: str
    fuel_type: str
    # Raw price text (e.g. "45 000"); the scraper assigns `.text`, so this is
    # a str — the original `int` annotation did not match what is stored.
    price_pln: str
    # Never populated by the scraper; defaulted so construction without it works.
    year_class: str = ""

class OtomotoScraper:
    def __init__(self, car_make: str) -> None:
        self.headers = {
            "User-Agent":   "Mozilla/5.0 (X11; Linux x86_64) "
            "AppleWebKit/537.11 (KHTML, like Gecko) "
            "Chrome/23.0.1271.64 Safari/537.11",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Charset": "ISO-8859-1,utf-8;q=0.7,*;q=0.3",
            "Accept-Encoding": "none",
            "Accept-Language": "en-US,en;q=0.8",
            "Connection": "keep-alive",            
        }
        self.car_make = car_make
        self.website = "https://www.otomoto.pl/osobowe"

    def scrape_pages(self, number_of_pages: int) -> List[Car]:
        cars = []
        for i in range(1, number_of_pages + 1):
            current_page = f"{self.website}/{self.car_make}/?page{i}"
            new_cars = self.scrape_cars_from_page(current_page)
            if new_cars:
                cars += new_cars
        return cars

    def scrape_cars_from_page(self, current_page: str) -> List[Car]:
        try:
            response = requests.get(current_page, headers= self.headers).text
            soup = BeautifulSoup(response, "html.parser")
            cars = self.extract_cars_from_page(soup)
            return cars
        except Exception as e:
            print(f"Problem z {current_page}, reason: {e}")
            return []
    
    def extract_cars_from_page(self, soup: BeautifulSoup) -> List[Car]:
        offers_table = soup.find('main', attrs = {'data-testid': 'search-results'})
        cars = offers_table.find_all('article')

        list_of_cars = []
        for car in cars:
            try:
                link = car.find('h2', attrs = {'data-testid': 'ad-title'}).find('a', href=True).get('href')
                full_name = car.find('h2', attrs = {'data-testid': 'ad-title'}).text
                price_pln = car.find('div', class_="e1b25f6f10 ooa-dsk6y6 er34gjf0").find('span', class_='ooa-1bmnxg7 e1b25f6f11').text
                attrs = car.find_all('li', class_='ooa-1k7nwcr e1teo0cs0')
                year = attrs[0].text
                mileage = attrs[1].text
                engine_capacity = attrs[2].text
                fuel_type = attrs[3].text

                list_of_cars.append(
                    car(
                        link=link,
                        full_name=full_name,
                        price_pln=price_pln,
                        year=year,
                        mileage=mileage,
                        engine_capacity=engine_capacity,
                        fuel_type=fuel_type
                ))
            except Exception as e:
                print(f"Error {e}")
        return list_of_cars

def write_to_csv(cars: "List[Car]") -> None:
    """Write the scraped cars to ``cars.csv`` in the current directory.

    BUG FIX: the original passed ``asdict(car)`` (a dict) to
    ``csv.writer.writerow``, which iterates the dict and therefore writes the
    field *names* on every data row instead of the values. ``csv.DictWriter``
    writes the values in a fixed column order.
    """
    fieldnames = ['link', 'full_name', 'price_pln', 'year', 'mileage',
                  'engine_capacity', 'fuel_type']
    # newline='' is required by the csv module (avoids blank lines on Windows);
    # utf-8 keeps Polish characters from the listings intact.
    with open('cars.csv', mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=fieldnames, extrasaction='ignore')
        writer.writeheader()
        for car in cars:
            writer.writerow(asdict(car))

def scrape_otomoto() -> None:
    """Entry point: scrape one results page of BMW offers and save to CSV."""
    make = 'bmw'
    scraper = OtomotoScraper(make)
    cars = scraper.scrape_pages(1)
    # BUG FIX: the original printed a leftover debug marker print('a') and had
    # the actual export disabled as a bare string literal ('''write_to_csv(cars)'''),
    # which Python evaluates and discards — so nothing was ever written.
    write_to_csv(cars)

# Run the scraper only when executed as a script, not when imported.
if __name__ == '__main__':
    scrape_otomoto()