Untitled
plain_text
2 months ago
9.5 kB
2
Indexable
Never
"""Scrape Mercedes listings from turbo.az, diff against the previous run,
persist results to Excel, and push new / newly-sold cars to a Telegram channel."""

import asyncio
import os
import time
from datetime import datetime, timedelta

import matplotlib.pyplot as plt
import pandas as pd
import requests
import seaborn as sns
from bs4 import BeautifulSoup
from prettytable import PrettyTable
from telegram import Bot

# NOTE(review): hard-coded bot credentials committed to source — this token
# should be rotated and loaded from an environment variable or secrets store.
bot_token = '6380703806:AAEkDw_JAveSyWoz99pfgD_D0Kjgv8BjzaQ'
channel_id = '@carpointaz'

# Excel file path (persists scrape results between runs so we can diff them)
excel_file_path = 'turbo_cars.xlsx'


async def send_text_to_telegram(message):
    """Send a Markdown-formatted text message to the configured channel.

    A delay follows each send to stay under Telegram's rate limits.
    """
    bot = Bot(token=bot_token)
    await bot.send_message(chat_id=channel_id, text=message, parse_mode='Markdown')
    # BUGFIX: time.sleep() blocks the whole event loop inside a coroutine;
    # asyncio.sleep() yields control while waiting.
    await asyncio.sleep(15)  # add delay to avoid hitting Telegram's rate limits


async def send_image_to_telegram(message, image_path):
    """Send an image with a caption to the configured channel."""
    bot = Bot(token=bot_token)
    # BUGFIX: the original passed an open() handle that was never closed
    # (file-handle leak); the context manager closes it deterministically.
    with open(image_path, 'rb') as photo:
        await bot.send_photo(chat_id=channel_id, photo=photo, caption=message)
    # BUGFIX: non-blocking delay (was time.sleep, which stalls the event loop).
    await asyncio.sleep(15)  # add delay to avoid hitting Telegram's rate limits


def job():
    """Scrape all result pages, mark each car as new/unchanged/sold, save to Excel.

    Rows are tagged in the "Vəziyyəti" column:
      - "Yeni"        : link not present in the previous run's file
      - "-"           : link already seen in the previous run
      - "Yeni Satılıb": link present last run but missing from this run
    """
    # URL of the first results page (pagination follows rel="next" links).
    url = 'https://turbo.az/autos?q%5Bsort%5D=&q%5Bmake%5D%5B%5D=1&q%5Bmodel%5D%5B%5D=&q%5Bmodel%5D%5B%5D=99&q%5Bused%5D=&q%5Bregion%5D%5B%5D=&q%5Bprice_from%5D=&q%5Bprice_to%5D=&q%5Bcurrency%5D=azn&q%5Bloan%5D=0&q%5Bbarter%5D=0&q%5Bcategory%5D%5B%5D=&q%5Byear_from%5D=2015&q%5Byear_to%5D=2016&q%5Bcolor%5D%5B%5D=&q%5Bfuel_type%5D%5B%5D=&q%5Bfuel_type%5D%5B%5D=5&q%5Bfuel_type%5D%5B%5D=6&q%5Bgear%5D%5B%5D=&q%5Btransmission%5D%5B%5D=&q%5Bengine_volume_from%5D=&q%5Bengine_volume_to%5D=&q%5Bpower_from%5D=&q%5Bpower_to%5D=&q%5Bmileage_from%5D=&q%5Bmileage_to%5D=&q%5Bonly_shops%5D=&q%5Bprior_owners_count%5D%5B%5D=&q%5Bseats_count%5D%5B%5D=&q%5Bmarket%5D%5B%5D=&q%5Bcrashed%5D=1&q%5Bpainted%5D=1&q%5Bfor_spare_parts%5D=0'  # Set your URL here

    # Load last results if they exist so we can diff against this run.
    last_results = pd.read_excel(excel_file_path) if os.path.exists(excel_file_path) else None

    # PrettyTable mirrors the Excel columns for console output.
    table = PrettyTable()
    table.field_names = ["Avtomobil", "İl", "Mühərrik", "Yürüş", "Qiymət", "Şəhər",
                         "Yerləşdirilib", "Saytdakı vaxtı", "Link", "Vəziyyəti"]

    excel_data = []  # rows accumulated for the Excel export
    product_count = 0

    headers = {
        'User-Agent': 'Mozilla/5.0',
    }

    while url:
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.text, 'html.parser')

        # The listings live in the first div.products after the "ELANLAR" title.
        elanlar_section = soup.find('p', class_='section-title_name', string='ELANLAR')
        # BUGFIX: the original dereferenced elanlar_section without a None
        # check, so a layout change would raise AttributeError instead of
        # taking the graceful exit path below.
        if elanlar_section is None:
            print("Failed to find product listings.")
            break
        products_section = elanlar_section.find_next('div', class_='products')
        if products_section is None:
            print("Failed to find product listings.")
            break

        for product_div in products_section.find_all('div', class_='products-i'):
            try:
                avtomobil = product_div.find('div', class_='products-i__name products-i__bottom-text').get_text().strip()
                il_muherrik_yurus = product_div.find('div', class_='products-i__attributes products-i__bottom-text').get_text().strip()
                il, muherrik, yurus = il_muherrik_yurus.split(', ')
                qiymet = product_div.find('div', class_='product-price').get_text().strip()
                sheher, yerlesdirilib = product_div.find('div', class_='products-i__datetime').get_text().strip().split(' ', 1)
                sheher = sheher.rstrip(',')

                # Replace relative date words ("today"/"yesterday" in Azerbaijani)
                # with concrete dd.mm.YYYY dates.
                today = datetime.today()
                if 'bugün' in yerlesdirilib:
                    yerlesdirilib = yerlesdirilib.replace('bugün', today.strftime('%d.%m.%Y'))
                elif 'dünən' in yerlesdirilib:
                    yesterday = today - timedelta(days=1)
                    yerlesdirilib = yerlesdirilib.replace('dünən', yesterday.strftime('%d.%m.%Y'))

                # Days the listing has been on the site ("Saytdakı vaxtı").
                yerlesdirilib_date_str = yerlesdirilib.split(' ')[0]
                yerlesdirilib_date = datetime.strptime(yerlesdirilib_date_str, '%d.%m.%Y')
                saytdaki_vaxti = (today - yerlesdirilib_date).days

                relative_link = product_div.find('a', class_='products-i__link')['href']
                link = "https://turbo.az" + relative_link

                row_data = [avtomobil, il, muherrik, yurus, qiymet, sheher,
                            yerlesdirilib, saytdaki_vaxti, link]

                # "Vəziyyəti": new unless the link appeared in the last run.
                if last_results is not None and link in last_results['Link'].values:
                    vaziyyeti = "-"
                else:
                    vaziyyeti = "Yeni"
                row_data.append(vaziyyeti)

                table.add_row(row_data)
                excel_data.append(row_data)
                product_count += 1
            except AttributeError:
                # A listing card missing an expected element is skipped.
                continue

        # Follow pagination until there is no rel="next" link.
        next_link = soup.find('a', rel='next')
        url = "https://turbo.az" + next_link['href'] if next_link else None

    # Links present last run but absent now are considered just sold.
    if last_results is not None:
        last_links = set(last_results['Link'].values)
        new_links = set(row[-2] for row in excel_data)  # row[-2] is the Link column
        sold_links = last_links - new_links
        for link in sold_links:
            row_index = last_results[last_results['Link'] == link].index[0]
            last_results.loc[row_index, 'Vəziyyəti'] = 'Yeni Satılıb'
        # Carry the sold rows into this run's output so they are reported once.
        sold_products = last_results[last_results['Vəziyyəti'] == 'Yeni Satılıb']
        excel_data.extend(sold_products.values.tolist())

    print(table)
    print(f"Total products found: {product_count}")

    columns = ["Avtomobil", "İl", "Mühərrik", "Yürüş", "Qiymət", "Şəhər",
               "Yerləşdirilib", "Saytdakı vaxtı", "Link", "Vəziyyəti"]
    df = pd.DataFrame(excel_data, columns=columns)
    df.to_excel(excel_file_path, index=False, engine='openpyxl')


def clean_excel_data():
    """Drop duplicate listings (same Link), keeping the first occurrence."""
    df = pd.read_excel('turbo_cars.xlsx')
    df.drop_duplicates(subset='Link', keep='first', inplace=True)
    df.to_excel('turbo_cars.xlsx', index=False)


def update_veziyyeti():
    """Demote 'Yeni Satılıb' rows to 'Satılıb' after they have been reported."""
    df = pd.read_excel('turbo_cars.xlsx')
    df.loc[df['Vəziyyəti'] == 'Yeni Satılıb', 'Vəziyyəti'] = 'Satılıb'
    df.to_excel('turbo_cars.xlsx', index=False)


def read_excel_data():
    """Return the deduplicated results as a DataFrame."""
    clean_excel_data()  # Clean the Excel data before reading
    df = pd.read_excel('turbo_cars.xlsx')
    return df


def create_plot(df, plot_num):
    """Save a bar chart of listing counts per model/status as plot_<n>.png."""
    df_count = df.groupby(["Avtomobil", "Vəziyyəti"]).size().reset_index(name="count")
    plt.figure(figsize=(10, 5))
    sns.barplot(y="Avtomobil", x="count", hue="Vəziyyəti", data=df_count)
    plt.title('Avtomobil Statistics')
    plt.savefig(f'plot_{plot_num}.png')  # Save the plot as an image
    plt.close()  # Close the plot to free up memory


async def main():
    """Run scrape/report cycles forever, announcing each car at most once."""
    # Links of cars already announced in this process's lifetime.
    sent_cars = set()
    while True:
        print("Starting a new run...")
        await send_text_to_telegram("Starting a new run...")

        job()
        car_data = read_excel_data()

        yeni_count = 0
        yeni_satilib_count = 0
        for index, row in car_data.iterrows():
            if row['Vəziyyəti'] in ['Yeni', 'Yeni Satılıb'] and row['Link'] not in sent_cars:
                message = f"*{row['Avtomobil']}*\nİl: {row['İl']}\nMühərrik: {row['Mühərrik']}\nYürüş: {row['Yürüş']}\nQiymət: {row['Qiymət']}\nŞəhər: {row['Şəhər']}\nYerləşdirilib: {row['Yerləşdirilib']}\nSaytdakı vaxtı: {row['Saytdakı vaxtı']}\nLink: [Baxmaq üçün basın]({row['Link']})\nVəziyyəti: {row['Vəziyyəti']}\n-------------------"
                await send_text_to_telegram(message)
                sent_cars.add(row['Link'])
                if row['Vəziyyəti'] == 'Yeni':
                    yeni_count += 1
                elif row['Vəziyyəti'] == 'Yeni Satılıb':
                    yeni_satilib_count += 1

        update_veziyyeti()  # Update the status in the Excel file
        print(f"Sent {yeni_count} new cars and {yeni_satilib_count} new sold cars to Telegram.")
        await send_text_to_telegram(f"Sent {yeni_count} new cars and {yeni_satilib_count} new sold cars to Telegram.")

        # Create and send the plot for this run.
        create_plot(car_data, 1)
        await send_image_to_telegram("Here is the plot for this run:", "plot_1.png")

        # BUGFIX: was time.sleep(10), which blocks the event loop.
        # NOTE(review): original comment said "Wait for an hour" but the delay
        # is 10 seconds — confirm the intended interval.
        await asyncio.sleep(10)


if __name__ == "__main__":
    asyncio.run(main())