Untitled
plain_text
2 months ago
3.0 kB
1
Indexable
Never
# -*- coding: utf-8 -*-
"""Scrape eBay listing pages and push newly seen items to a Telegram chat.

Reads target URLs from ``scrape_urls.txt``, remembers already-reported
item ids (mapped to titles) in ``stored_data.json``, and sends one
Telegram message per listing that has not been seen before.
"""

import asyncio
import json

import aiohttp
from lxml import html

# Telegram API parameters.
# SECURITY NOTE(review): the bot token is hard-coded; it should live in an
# environment variable or a config file kept out of version control.
chat_id = '977882166'
bot_token = '6137160905:AAH2IQe5ievrceSihoy1QzVhKBdbITBhwqg'
send_message_url = f'https://api.telegram.org/bot{bot_token}/sendMessage'

# Load stored data (item_id -> title); start fresh when the file is missing
# or corrupt.  The original bare `except:` also hid unrelated errors such as
# KeyboardInterrupt, so it is narrowed to the two expected failure modes.
try:
    with open("stored_data.json") as f:
        stored_data = json.load(f)
except (FileNotFoundError, json.JSONDecodeError):
    stored_data = {}

# Read the urls to scrape, one per line.
with open("scrape_urls.txt") as f:
    urls = f.read().splitlines()


async def scrape_url(session, url):
    """Fetch one eBay results page and report listings not seen before.

    Parameters
    ----------
    session : aiohttp.ClientSession
        Shared HTTP session used both for the scrape and the Telegram post.
    url : str
        Results-page URL; ``LH_Sold=1`` in the query marks a sold-items search.
    """
    async with session.get(url) as response:
        page_text = await response.text()
    tree = html.fromstring(page_text)

    # Sold-item searches get a "Verkauft" (German: sold) prefix in the message.
    verkauft = "Verkauft" if "LH_Sold=1" in url else ""

    listings = tree.xpath("//li[@class='s-item s-item__pl-on-bottom']")
    for listing in listings:
        title_element = listing.xpath(".//img/@alt")
        if title_element:
            # Encode/decode round trip to repair mojibake in the alt text.
            # NOTE(review): assumes the page was mis-decoded as latin-1 —
            # confirm this is still needed for current eBay responses.
            title = (title_element[0]
                     .encode("latin-1", errors="replace")
                     .decode("utf-8", "ignore")
                     .strip())
        else:
            title = "N/A"

        price = listing.xpath(".//span[@class='s-item__price']/text()")
        if price:
            price = price[0]
        else:
            # Fallback: some listings nest the price in an inner span.
            price = listing.xpath(".//span[@class='s-item__price']//span/text()")
            price = price[0] if price else "N/A"

        shipping_cost = listing.xpath(
            ".//span[@class='s-item__shipping s-item__logisticsCost']/text()")
        if shipping_cost:
            shipping_cost = shipping_cost[0].replace("+EUR", "+ EUR")
        else:
            shipping_cost = "N/A"

        # BUGFIX: the original rebound the function parameter `url` here,
        # clobbering the page URL for the rest of the loop; use a distinct
        # name for the per-listing link instead.
        item_url = listing.xpath(".//a/@href")
        item_url = item_url[0] if item_url else "N/A"

        # The item id is the last path segment, stripped of any query string.
        item_id = item_url.split("/")[-1].split("?")[0]
        if item_id not in stored_data:
            data = {
                'chat_id': chat_id,
                'text': f'{verkauft} {price} {shipping_cost}\n\n{title}\n\n{item_url}',
            }
            await session.post(send_message_url, data=data)
            stored_data[item_id] = title
            # Persist immediately so a crash mid-run does not re-notify
            # already-sent items on the next run.
            with open("stored_data.json", "w") as f:
                json.dump(stored_data, f)


async def main():
    """Scrape all configured URLs concurrently over one shared session."""
    async with aiohttp.ClientSession() as session:
        tasks = [scrape_url(session, url) for url in urls]
        await asyncio.gather(*tasks)


if __name__ == "__main__":
    asyncio.run(main())

    # Cap the stored history: once past 500 entries, drop the oldest 400
    # (dict insertion order makes list(stored_data)[400:] the newest ones).
    if len(stored_data) > 500:
        stored_data = {k: stored_data[k] for k in list(stored_data)[400:]}

    # Save the updated stored data to the json file.
    with open("stored_data.json", "w") as f:
        json.dump(stored_data, f)