# eBay listing scraper: polls saved search URLs and pushes newly-seen
# listings to a Telegram chat via the Bot API.
# -*- coding: utf-8 -*-
import aiohttp
import asyncio
import json
from lxml import html
# Telegram API parameters.
# NOTE(security): a hard-coded bot token is exposed to anyone who can read
# this file -- rotate it and load it from an environment variable or a
# secrets store instead.
chat_id = '977882166'
bot_token = '6137160905:AAH2IQe5ievrceSihoy1QzVhKBdbITBhwqg'
send_message_url = f'https://api.telegram.org/bot{bot_token}/sendMessage'

# Load previously-seen listing IDs; start fresh when the file is missing or
# corrupt.  A bare `except:` here would also swallow KeyboardInterrupt and
# SystemExit, so catch only the failures we actually expect.
try:
    with open("stored_data.json") as f:
        stored_data = json.load(f)
except (FileNotFoundError, json.JSONDecodeError):
    stored_data = {}

# One eBay search URL per line.
with open("scrape_urls.txt") as f:
    urls = f.read().splitlines()
# Scraping function
async def scrape_url(session, url):
    """Scrape one eBay search-results page and notify Telegram of new listings.

    Parameters
    ----------
    session : aiohttp.ClientSession
        Shared HTTP session used for both the scrape GET and the Telegram POST.
    url : str
        eBay search URL; ``LH_Sold=1`` in the query string marks a
        sold-items search.

    Side effects: sends one Telegram message per listing whose item id is not
    yet in the module-level ``stored_data`` dict, records the id there, and
    rewrites ``stored_data.json`` when anything new was found.
    """
    async with session.get(url) as response:
        page_text = await response.text()
    tree = html.fromstring(page_text)

    # Messages from "sold items" searches get a "Verkauft" (sold) prefix.
    verkauft = "Verkauft" if "LH_Sold=1" in url else ""

    found_new = False
    for listing in tree.xpath("//li[@class='s-item s-item__pl-on-bottom']"):
        title_element = listing.xpath(".//img/@alt")
        if title_element:
            # Round-trip re-encode to repair mojibake in eBay's alt text.
            # NOTE(review): latin-1 -> utf-8 only helps if the page was
            # mis-decoded upstream; confirm this is still necessary.
            title = title_element[0].encode("latin-1", errors="replace").decode("utf-8", "ignore").strip()
        else:
            title = "N/A"

        price = listing.xpath(".//span[@class='s-item__price']/text()")
        if price:
            price = price[0]
        else:
            # Some layouts nest the price one span deeper.
            price = listing.xpath(".//span[@class='s-item__price']//span/text()")
            price = price[0] if price else "N/A"

        shipping_cost = listing.xpath(".//span[@class='s-item__shipping s-item__logisticsCost']/text()")
        if shipping_cost:
            shipping_cost = shipping_cost[0].replace("+EUR", "+ EUR")
        else:
            shipping_cost = "N/A"

        # Do NOT reuse the `url` parameter name here: the original shadowed it
        # with the per-listing href, losing the page URL after one iteration.
        hrefs = listing.xpath(".//a/@href")
        item_url = hrefs[0] if hrefs else "N/A"
        item_id = item_url.split("/")[-1].split("?")[0]

        if item_id not in stored_data:
            data = {'chat_id': chat_id, 'text': f'{verkauft} {price} {shipping_cost}\n\n{title}\n\n{item_url}'}
            await session.post(send_message_url, data=data)
            stored_data[item_id] = title
            found_new = True

    # Persist once per page instead of once per new item (the original
    # rewrote the whole JSON file inside the loop).  No await between open
    # and dump, so concurrent tasks cannot interleave this write.
    if found_new:
        with open("stored_data.json", "w") as f:
            json.dump(stored_data, f)
# Use asyncio to scrape multiple URLs simultaneously
async def main():
    """Fan out one scraping task per configured URL over a shared session."""
    async with aiohttp.ClientSession() as session:
        await asyncio.gather(*(scrape_url(session, target) for target in urls))
if __name__ == "__main__":
asyncio.run(main())
# Delete the first 400 data points in stored_data if the number of data points exceeds 500
if len(stored_data) > 500:
stored_data = {k: stored_data[k] for k in list(stored_data)[400:]}
# Save the updated stored data to the json file
with open("stored_data.json", "w") as f:
json.dump(stored_data, f)Editor is loading...