# -*- coding: utf-8 -*-

import aiohttp
import asyncio
import json
from lxml import html

# Telegram API parameters
chat_id = '977882166'
bot_token = '6137160905:AAH2IQe5ievrceSihoy1QzVhKBdbITBhwqg'
send_message_url = f'https://api.telegram.org/bot{bot_token}/sendMessage'

# Load previously stored item data from the JSON file; start fresh if it is missing or unreadable
try:
    with open("stored_data.json") as f:
        stored_data = json.load(f)
except (FileNotFoundError, json.JSONDecodeError):
    stored_data = {}
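# Illustrative note (assumption, not from the original paste): stored_data maps eBay item
# IDs to listing titles, e.g. {"134512345678": "Game Boy Classic", "334987654321": "Sega Mega Drive II"},
# so the "already seen" check in scrape_url() only needs the item ID as the key.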

# Read the search URLs from scrape_urls.txt (one per line)
with open("scrape_urls.txt") as f:
    urls = f.read().splitlines()
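# Illustrative example of scrape_urls.txt, assuming one eBay search URL per line
# (the queries below are placeholders, not taken from the original paste):
#   https://www.ebay.de/sch/i.html?_nkw=game+boy
#   https://www.ebay.de/sch/i.html?_nkw=game+boy&LH_Sold=1&LH_Complete=1
# The LH_Sold=1 parameter is what scrape_url() checks to label sold listings.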

# Scrape a single eBay search results page and notify Telegram about new listings
async def scrape_url(session, url):
    async with session.get(url) as response:
        page = await response.text()
        tree = html.fromstring(page)

        # Searches filtered to sold items (LH_Sold=1) get a "Verkauft" ("sold") label
        if "LH_Sold=1" in url:
            verkauft = "Verkauft"
        else:
            verkauft = ""
        
        listings = tree.xpath("//li[@class='s-item s-item__pl-on-bottom']")
        for listing in listings:
            # Title comes from the listing image's alt text; re-encode to repair mojibake
            title_element = listing.xpath(".//img/@alt")
            if title_element:
                title = title_element[0].encode("latin-1", errors="replace").decode("utf-8", "ignore").strip()
            else:
                title = "N/A"

            # Price: plain text node first, nested span as a fallback
            price = listing.xpath(".//span[@class='s-item__price']/text()")
            if price:
                price = price[0]
            else:
                price = listing.xpath(".//span[@class='s-item__price']//span/text()")
                if price:
                    price = price[0]
                else:
                    price = "N/A"

            # Shipping cost, with a readability fix for the "+EUR" prefix
            shipping_cost = listing.xpath(".//span[@class='s-item__shipping s-item__logisticsCost']/text()")
            if shipping_cost:
                shipping_cost = shipping_cost[0].replace("+EUR", "+ EUR")
            else:
                shipping_cost = "N/A"

            # Listing URL; use a separate name so the search URL passed to this function is not shadowed
            item_url = listing.xpath(".//a/@href")
            if item_url:
                item_url = item_url[0]
            else:
                item_url = "N/A"

            # The item ID is the last path segment of the listing URL
            item_id = item_url.split("/")[-1].split("?")[0]

            # Only notify Telegram about listings that have not been seen before
            if item_id not in stored_data:
                data = {'chat_id': chat_id, 'text': f'{verkauft} {price} {shipping_cost}\n\n{title}\n\n{item_url}'}
                await session.post(send_message_url, data=data)
                stored_data[item_id] = title
                with open("stored_data.json", "w") as f:
                    json.dump(stored_data, f)


# Scrape all search URLs concurrently with asyncio
async def main():
    async with aiohttp.ClientSession() as session:
        tasks = [scrape_url(session, url) for url in urls]
        await asyncio.gather(*tasks)

if __name__ == "__main__":
    asyncio.run(main())

    # Drop the oldest 400 entries once more than 500 item IDs have been stored
    if len(stored_data) > 500:
        stored_data = {k: stored_data[k] for k in list(stored_data)[400:]}

    # Save the updated stored data back to the JSON file
    with open("stored_data.json", "w") as f:
        json.dump(stored_data, f)
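# Optional sketch (an assumption, not part of the original script): rather than scheduling
# the script externally, e.g. with cron, the same check could be looped in-process.
# The 300-second interval is an arbitrary example value.
#
#   async def run_forever(interval_seconds=300):
#       while True:
#           await main()
#           await asyncio.sleep(interval_seconds)
#
#   asyncio.run(run_forever())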