Untitled
unknown
plain_text
a year ago
25 kB
4
Indexable
Never
# Lazada product crawler.
#
# For every row of the Excel sheet (columns: key, key1, key2, key3, test) the
# script queries three pages of Lazada's tag-search JSON endpoint, keeps the
# products whose name contains the row's keywords, and stores them in MySQL.
# When the plain HTTP request is blocked, the same URL is loaded through a
# Selenium-driven Chrome routed over a freshly rented authenticated proxy.
#
# NOTE(security): the database password, the proxy API key and the session
# cookie below were hard-coded in the original script.  The first two can now
# be overridden through environment variables; all of them should be rotated
# and moved out of source control.

import asyncio
import json
import math
import os
import random
import threading
import time
import zipfile

import aiohttp
import pandas as pd
import pymysql
import requests
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from slugify import slugify

# Credentials of the proxy currently in use; refreshed by get_proxy_sele().
PROXY_HOST = ''  # rotating proxy or host
PROXY_PORT = ''  # port
PROXY_USER = ''  # username
PROXY_PASS = ''  # password

# Sources of the Chrome proxy-auth extension; refreshed by getmb().
manifest_json = ""
background_js = ""

# Path to the Excel workbook that drives the crawl and receives its status.
excel_file = './test.xlsx'

# Kept for backward compatibility; the driver builders resolve their own path.
path = "C:/Users/Admin/Desktop/lazada/backend/tool/chromedriver.exe"

TINPROXY_API_KEY = os.environ.get(
    "TINPROXY_API_KEY", "70QpHJ5sKS3i7kgdJtuKrdh9M0BsHXbK")
TINPROXY_URL = (
    "https://api.tinproxy.com/proxy/get-new-proxy"
    "?authen_ips={}&location={}&api_key={}"
)

LANDING_URL = "https://www.lazada.vn/dien-thoai-di-dong/"
PLUGIN_FILE = 'proxy_auth_plugin.zip'

# How many browser-based attempts get_data() makes before giving up on a URL.
MAX_BROWSER_ATTEMPTS = 3

# Exceptions that mean "the response was not the JSON payload we expected".
_PARSE_ERRORS = (ValueError, KeyError, TypeError, AttributeError)

_MANIFEST_JSON = """
{
    "version": "1.0.0",
    "manifest_version": 2,
    "name": "Chrome Proxy",
    "permissions": [
        "proxy",
        "tabs",
        "unlimitedStorage",
        "storage",
        "<all_urls>",
        "webRequest",
        "webRequestBlocking"
    ],
    "background": {
        "scripts": ["background.js"]
    },
    "minimum_chrome_version":"22.0.0"
}
"""

_BACKGROUND_JS_TEMPLATE = """
var config = {
    mode: "fixed_servers",
    rules: {
        singleProxy: {
            scheme: "http",
            host: "%s",
            port: parseInt(%s)
        },
        bypassList: ["localhost"]
    }
};

chrome.proxy.settings.set({value: config, scope: "regular"}, function() {});

function callbackFn(details) {
    return {
        authCredentials: {
            username: "%s",
            password: "%s"
        }
    };
}

chrome.webRequest.onAuthRequired.addListener(
    callbackFn,
    {urls: ["<all_urls>"]},
    ['blocking']
);
"""

# Headers replayed on the direct (non-browser) request.  The cookie is a
# captured browser session and is reproduced verbatim.
REQUEST_HEADERS = {
    "cookie": (
        "__wpkreporterwid_=813ed48b-595c-4adf-9f22-24f1b8328e2a; "
        "t_fv=1670425558316; "
        "t_uid=ZfqbNVCVcuji2pNujFDgD7X8IpYsozLZ; "
        "cna=1pkXHJ5Pty8CAXRvHRQmtaHQ; "
        "lzd_cid=96503b91-03f6-4a84-a33d-a69b147980af; "
        "miidlaz=miidgg5rbu1gmghens6gt8p; "
        "lwrid=AQGHIpP%2FCbpv79mAoewzX39uI4vP; "
        "_bl_uid=g6l2nf8gqRFoUCf52j0wg9ktCRb8; "
        "_gcl_au=1.1.1020666756.1680854480; "
        "_ga=GA1.2.303867327.1681100355; "
        "cto_axid=qVhKOIlvdz8gBvNCGiZJpvO9HK21W2gI; "
        "pdp_sfo=1; "
        "dsa_category_disclaimer=true; "
        "_gcl_aw=GCL.1684482945.CjwKCAjwvJyjBhApEiwAWz2nLbwA1YN9Bte0-xwC9cjeLL5W8bOre5Flpn6stpzXeN396SwHoXCH5BoCSaQQAvD_BwE; "
        "lzd_click_id=clkgl3rhh1h18ddqulh34f; "
        "_uetvid=a70580a0d8e511ed8ba31b9e895a65ea; "
        "cto_bundle=o16OV19JR2tsbmZUQjNSMlZXdU1nJTJCSnlJTkdsa0FqeHpOZjgwOHU5QyUyQkFzZzRFSU9MYmNwb1h0anY3bWozS1BIaiUyRkxQUnRoS3kzZURzazZKUnBiNjRMZ0hNSWJxVlNEYzc5QnFDNCUyQkswMXBNVTVQaTVEbXlYaDQ5SGxjSnZMS3ZMTWZOWGc1Nml6TE8wRGRkZmRlJTJCWlpYNkd3JTNEJTNE; "
        "AMCV_126E248D54200F960A4C98C6%40AdobeOrg=-1124106680%7CMCIDTS%7C19504%7CMCMID%7C52344899913288371074101774141992786277%7CMCAAMLH-1685675655%7C3%7CMCAAMB-1685675655%7CRKhpRz8krg2tLO6pguXWp5olkAcUniQYPHaMWWgdJ3xzPWQmdj0y%7CMCOPTOUT-1685078055s%7CNONE%7CvVersion%7C5.2.0; "
        "sgcookie=E100QFP%2BvohEo9oGTPL5SxXeeRlcKciHrMGTIHpFzH6Ej%2F7suAYVHgFLAVoD6SK81cStjX%2B6v7paxaoNBBGj4OBHTs%2FcXoQaMr2fPRFTDhLnV2A%3D; "
        "hng=VN|en|VND|704; "
        "userLanguageML=en; "
        "xlly_s=1; "
        "EGG_SESS=S_Gs1wHo9OvRHCMp98md7DUZe1_Oi7MP7VJBGrkx9fhSS7nQTnQJeqqXEBD6IWifQZY9APBbZ1NYv6XiRfJqEz8D_8i-At2bdf63doGKaAyOxhfxDvfM3CbBdWPewHrMN0lq3c1joYuwyaSJfxbUY4Skd9BkzmJRNg2pqZfhI3M=; "
        "ak_bmsc=366442E74FE13AEE7E9E22CBF35E72F3~000000000000000000000000000000~YAAQnwqrcV0uBmaIAQAAw7vqZxOadtWFkCIDKA3lr2K8jKVeIC2TUkasu4H6eKqmQuWeXkL9exGezkGcDBTQ0Osjf3qxB8vWzlGcSI9HHkP+1rA/LyLzgplq5luPovmO/seeTA9FpIpICoYIvBbPi9q6hpL0TfXYov3Wo4K8JXdxX5pJcy7s+tEaLEdXznfgGgjt9uFcRRZnsLasg9Ch5y3tnfajog3cQaZvMLi/NlRVddfH8v1wl175TKjerLWSxnLqiHushxvP+dM1VUTlzqpuZD8/5rYYtNRXgOCfvIPplv1UhjViv8pdoFtkizVYYBedlO+X1vXmISciWdiPG6ZREcJGzUFZTL4fpn6er4mwQqGdXN16Lpsbv2blc+7SfL8NExaTKT7n; "
        "lzd_sid=108de1144761220db8a1b3ec4f3d3855; "
        "_m_h5_tk=0eed2417b17557ffec7129878cb23744_1685379977546; "
        "_m_h5_tk_enc=bf0dad79f26dddd63250e7d71e0f4568; "
        "_tb_token_=fba9008553a37; "
        "t_sid=ZnaxX3Uxmr2XbTfOIbtN1nL4EFoWEXGD; "
        "utm_channel=NA; "
        "bm_sv=5662071E1A54AC7ED7E09096D4EF806D~YAAQnwqrcVuFB2aIAQAAQBVBaBNNYUHVTfW7H252VbTwWqy6HVSsrWwCibETEQQ0liYOnyxdWYwTnHBuHADoFf6UWx7lALZ3NbY4ET8AEMtpsakFkJ/pzDw91QjxrVUeOEF6umAN/tTD/mPAWUGmNPPLZ0jLwbLZsWH4RYlRfza8O/BQVzpD38dsFsAbSzhRj/4RAJioi23RAMhNmaeyIpFW/cjvFWjei4zBw6zueTW9qZP/lGqR5h+6Zv6LQaql~1; "
        "tfstk=cpWPBO2LQ8ey4EILWKvF7wNkTWvRZjIlxx-6ZrAhoXaor3Oli-gpmtj90Hiq-Qf..; "
        "l=fBjr2dWuTojiVbfXBOfwPurza77OSIRAguPzaNbMi9fP_8fp5g0hW1aR5989C3MNFspJR3l39SNMBeYBqCmLM4QoBKRS3xkmnRScdTf..; "
        "isg=BLS04uxkbJMHsv-7VdCYBfSBhXImjdh3aPJCk04VQD_CuVQDdp2oB2p3OelhQRDP"
    ),
    "x-csrf-token": "fba9008553a37",
    "accept": "application/json, text/plain, */*",
    "accept-language": "en-US,en;q=0.9",
    "x-requested-with": "XMLHttpRequest",
}

# Shared MySQL connection; save_db() pings it before each insert so a dropped
# link is re-established transparently.
connection_data = pymysql.connect(
    host=os.environ.get("LAZADA_DB_HOST", "103.110.84.6"),
    user=os.environ.get("LAZADA_DB_USER", "pdjtvluc_tuan"),
    passwd=os.environ.get("LAZADA_DB_PASS", "lazadasp1"),
    db=os.environ.get("LAZADA_DB_NAME", "pdjtvluc_lazadadb"),
    port=int(os.environ.get("LAZADA_DB_PORT", "3306")),
)

# Read the Excel workbook; the crawl also writes its progress back into it.
df = pd.read_excel(excel_file)


def _to_float(value, default=0.0):
    """Return ``value`` as a float, or ``default`` if it is not a finite-ish number."""
    try:
        number = float(value)
    except (TypeError, ValueError):
        return default
    return default if math.isnan(number) else number


def _parse_discount(raw):
    """Return the numeric part of a discount label such as ``"15% Off"``."""
    if not raw:
        return 0.0
    return _to_float(str(raw).split("%")[0].strip())


def _parse_sold_count(raw):
    """Convert a label such as ``"1,2k+ Đã bán"`` or ``"850 sold"`` to a number.

    A trailing ``k`` multiplies by 1000 (the original code dropped it, which
    ranked "1.2k sold" below "900 sold").  Unparseable labels yield ``0.0``.
    """
    if raw is None:
        return 0.0
    text = str(raw).lower()
    for marker in ("sold", "đã bán", "+"):
        text = text.replace(marker, "")
    # The site uses a decimal comma in Vietnamese labels ("1,2k").
    text = text.replace(",", ".").strip()
    multiplier = 1.0
    if text.endswith("k"):
        multiplier = 1000.0
        text = text[:-1].strip()
    return _to_float(text) * multiplier


def _required_keywords(key1, key2, key3):
    """Return the lower-cased keywords a product name must contain.

    Blank / non-string cells (pandas yields NaN for empty cells) are ignored,
    and ``"x"`` in the third column is the sheet's "no third keyword" marker.
    """
    keywords = []
    for position, raw in enumerate((key1, key2, key3)):
        if not isinstance(raw, str):
            continue
        word = raw.strip().lower()
        if not word:
            continue
        if position == 2 and word == "x":
            continue
        keywords.append(word)
    return keywords


def _mark_row(index, **columns):
    """Write status ``columns`` into row ``index`` of the sheet and save it."""
    for column, value in columns.items():
        df.loc[index, column] = value
    df.to_excel(excel_file, index=False)


async def save_db(item, category_sp):
    """Insert one crawled product into the ``product`` table.

    Args:
        item: Product dict as produced by :func:`info_item`.
        category_sp: Category label stored with the product.

    Raises:
        pymysql.MySQLError: If the insert fails (the transaction is rolled
            back first).
    """
    is_load = 0
    stock = 0
    conn = connection_data
    # Re-open the link if the server dropped it while the browser was busy.
    conn.ping(reconnect=True)
    sql = """
        INSERT INTO product(product_id,name ,discount,price,price_min,price_max,
            typer_shop,historical_sold,image,images,liked_count,
            link,rating,shop_location,stock,category,is_load )
        VALUE(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
    """
    val = (
        int(item['itemId']),
        str(item['name']),
        _to_float(item['discount']),
        _to_float(item['price']),
        _to_float(item['price']),
        _to_float(item['originalPrice']),
        str(item['Shopmall']),
        _to_float(item['itemSoldCntShow']),
        str(item['image']),
        str(item['images']),
        str(item['review']),
        str(item['itemUrl']),
        _to_float(item['ratingScore']),
        str(item['location']),
        stock,
        category_sp,
        is_load,
    )
    try:
        with conn.cursor() as cursor:
            cursor.execute(sql, val)
        conn.commit()
    except pymysql.MySQLError:
        conn.rollback()
        raise


async def fetch(url):
    """Return the body of ``url`` as text using a one-off aiohttp session."""
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            return await response.text()


async def get_proxy_sele():
    """Rent a new proxy for Selenium and publish it in the ``PROXY_*`` globals.

    Retries until the provider returns a usable ``host:port``; after two
    consecutive failures it pauses for a minute before trying again.

    Returns:
        dict with keys ``PROXY_HOST``, ``PROXY_PORT``, ``PROXY_USER`` and
        ``PROXY_PASS``.
    """
    global PROXY_HOST
    global PROXY_PORT
    global PROXY_USER
    global PROXY_PASS

    location = random.choice(["vn_dn", "vn_hcm"])
    url = TINPROXY_URL.format("14.191.157.215", location, TINPROXY_API_KEY)
    failures = 0
    while True:
        if failures >= 2:
            print("dang ngu dong")
            await asyncio.sleep(60)
            failures = 0
        try:
            data = json.loads(await fetch(url))
            server_host = data['data']['http_ipv6_ipv4']
            user_name = data['data']['authentication']['username']
            pass_word = data['data']['authentication']['password']
            # Raises for None / "" / anything that is not "ip:port".
            ip, port = server_host.split(":")
        except (aiohttp.ClientError, asyncio.TimeoutError) + _PARSE_ERRORS:
            failures += 1
            await asyncio.sleep(1)
            continue
        break

    PROXY_HOST = ip
    PROXY_PORT = port
    PROXY_USER = user_name
    PROXY_PASS = pass_word
    my_proxies2 = {
        "PROXY_HOST": ip,
        "PROXY_PORT": port,
        "PROXY_USER": user_name,
        "PROXY_PASS": pass_word,
    }
    print("get proxy cho selenium", my_proxies2)
    return my_proxies2


async def getmb():
    """Fetch a new proxy and regenerate the Chrome proxy-auth extension sources."""
    global manifest_json
    global background_js

    print("lay proxy")
    await get_proxy_sele()
    await asyncio.sleep(1)
    manifest_json = _MANIFEST_JSON
    background_js = _BACKGROUND_JS_TEMPLATE % (
        PROXY_HOST, PROXY_PORT, PROXY_USER, PROXY_PASS)


def _build_driver(use_proxy, user_agent, remove_stale_plugin):
    """Create a Chrome driver, optionally routed through the current proxy."""
    base_dir = os.path.dirname(os.path.abspath(__file__))
    driver_path = os.path.join(base_dir, 'chromedriver')
    chrome_options = webdriver.ChromeOptions()

    if remove_stale_plugin and os.path.exists(PLUGIN_FILE):
        # Drop the extension archive left behind by a previous attempt.
        os.remove(PLUGIN_FILE)
        print("Đã xóa tệp", PLUGIN_FILE)
    time.sleep(0.2)

    if use_proxy:
        with zipfile.ZipFile(PLUGIN_FILE, 'w') as zp:
            zp.writestr("manifest.json", manifest_json)
            zp.writestr("background.js", background_js)
        chrome_options.add_extension(PLUGIN_FILE)
    if user_agent:
        chrome_options.add_argument('--user-agent=%s' % user_agent)

    # One combined list: a second add_experimental_option() call with the same
    # key replaces the first, which silently dropped "enable-automation".
    chrome_options.add_experimental_option("excludeSwitches", [
        "enable-automation",
        "ignore-certificate-errors",
        "safebrowsing-disable-download-protection",
        "safebrowsing-disable-auto-update",
        "disable-client-side-phishing-detection",
    ])
    chrome_options.add_experimental_option('useAutomationExtension', False)

    try:
        # Selenium 4 signature.
        driver = webdriver.Chrome(
            service=Service(driver_path), options=chrome_options)
    except TypeError:
        # Selenium 3 signature, as used by the original script.
        driver = webdriver.Chrome(driver_path, chrome_options=chrome_options)

    driver.execute_script(
        "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
    return driver


def get_chromedriver2(use_proxy=True, user_agent=None):
    """Build a Chrome driver after deleting any stale proxy extension archive."""
    return _build_driver(use_proxy, user_agent, remove_stale_plugin=True)


def get_chromedriver(use_proxy=True, user_agent=None):
    """Build a Chrome driver, overwriting the proxy extension archive in place."""
    return _build_driver(use_proxy, user_agent, remove_stale_plugin=False)


async def get_cookie(url2, max_attempts=None):
    """Load ``url2`` in a proxied browser and return the raw response text.

    Each attempt rents a new proxy, opens the Lazada landing page, opens
    ``url2`` in a new tab and reads the ``<pre>`` element in which Chrome
    renders a JSON response.

    Args:
        url2: The AJAX URL to load.
        max_attempts: Give up after this many attempts; ``None`` (default)
            retries until one succeeds, as the original script did.

    Returns:
        The text content of ``/html/body/pre``.

    Raises:
        RuntimeError: If ``max_attempts`` is set and every attempt failed.
    """
    attempt = 0
    while max_attempts is None or attempt < max_attempts:
        attempt += 1
        await getmb()
        build = get_chromedriver if attempt == 1 else get_chromedriver2
        driver = None
        try:
            driver = build(use_proxy=True)
            driver.set_window_rect(100, 200, 375, 812)
            driver.get(LANDING_URL)
            # Pass the URL as an argument instead of splicing it into the JS.
            driver.execute_script("window.open(arguments[0], '_blank')", url2)
            # Switch to the newly opened tab before reading from it.
            driver.switch_to.window(driver.window_handles[-1])
            return driver.find_element("xpath", "/html/body/pre").text
        except WebDriverException:
            await asyncio.sleep(1)
        finally:
            if driver is not None:
                # quit() ends the whole browser process; close() only closed
                # a tab and leaked one Chrome per attempt.
                driver.quit()
    raise RuntimeError(
        "could not load {} after {} attempts".format(url2, max_attempts))


def get_new_proxy():
    """Rent a proxy for ``requests``, retrying every 30 seconds until it works.

    Returns:
        A ``proxies`` mapping of the form ``{"http": "http://user:pass@host:port"}``.
    """
    url = TINPROXY_URL.format("103.107.183.30", "vn_hcm", TINPROXY_API_KEY)
    while True:
        try:
            r = requests.get(url, timeout=30)
            data = json.loads(r.text)
            server_host = data['data']['http_ipv6_ipv4']
            user_name = data['data']['authentication']['username']
            pass_word = data['data']['authentication']['password']
        except (requests.RequestException,) + _PARSE_ERRORS:
            time.sleep(30)
            continue
        my_proxies = {
            "http": "http://{}:{}@{}".format(user_name, pass_word, server_host),
        }
        print(my_proxies)
        return my_proxies


def info_item(data, list_item, key1, key2, key3):
    """Append the products in ``data`` whose name matches the keywords.

    Args:
        data: Parsed search response; must contain ``data['mods']['listItems']``.
        list_item: List that receives one dict per matching product.
        key1, key2: Keywords the product name must contain (case-insensitive).
        key3: Optional third keyword; ``"x"`` or an empty cell means "none".
            NOTE(review): the original applied the third keyword only when it
            was ``"x"``, which looks inverted — confirm against the sheet.

    Raises:
        KeyError, TypeError: If ``data`` does not have the expected shape.
            Callers rely on this to detect a blocked / non-JSON response.
    """
    items = data['mods']['listItems']
    keywords = _required_keywords(key1, key2, key3)

    for item in items:
        itemId = item.get('itemId')
        if itemId is None:
            continue
        name = item.get('name') or ""
        name_new = str(name).lower()
        if not all(word in name_new for word in keywords):
            continue

        try:
            Shopmall = item['icons'][0]['alias']
        except (KeyError, IndexError, TypeError):
            Shopmall = ""

        images = [
            thumb['image']
            for thumb in (item.get('thumbs') or [])
            if isinstance(thumb, dict) and 'image' in thumb
        ]

        list_item.append({
            "key": itemId,
            "itemId": itemId,
            "brandId": item.get('brandId', -1),
            "brandName": item.get('brandName', ""),
            "discount": _parse_discount(item.get('discount')),
            "image": item.get('image', ""),
            "itemSoldCntShow": _parse_sold_count(item.get('itemSoldCntShow')),
            "itemUrl": item.get('itemUrl', ""),
            "location": item.get('location', ""),
            "name": name,
            "nid": item.get('nid', -1),
            # price max
            "originalPrice": _to_float(item.get('originalPrice')),
            "originalPriceShow": item.get('originalPriceShow', ""),
            # price min
            "price": _to_float(item.get('price')),
            "priceShow": item.get('priceShow', 0),
            "ratingScore": _to_float(item.get('ratingScore')),
            "review": item.get('review', ""),
            "sellerId": item.get('sellerId', 0),
            "Shopmall": Shopmall,
            "images": images,
        })


async def get_data(url, list_data, key1, key2, key3, index):
    """Fetch one search page and append its matching products to ``list_data``.

    Tries a direct HTTP request first; if that fails or does not return the
    expected JSON, falls back to the proxied browser up to
    ``MAX_BROWSER_ATTEMPTS`` times.  When every attempt fails the sheet row is
    marked ``'da thu 3 lan'``.

    Args:
        url: Search page URL.
        list_data: List that receives the matching product dicts.
        key1, key2, key3: Name keywords, see :func:`info_item`.
        index: Row index of the current keyword in the sheet.
    """
    print(url)
    await asyncio.sleep(0.1)
    try:
        r = requests.get(url, headers=REQUEST_HEADERS, timeout=30)
        info_item(json.loads(r.text), list_data, key1, key2, key3)
        return
    except (requests.RequestException,) + _PARSE_ERRORS:
        pass  # blocked or not JSON: fall through to the browser

    for _ in range(MAX_BROWSER_ATTEMPTS):
        body = await get_cookie(url)
        await asyncio.sleep(0.1)
        try:
            info_item(json.loads(body), list_data, key1, key2, key3)
            return
        except _PARSE_ERRORS:
            continue

    # Still failing after the allowed attempts: record it and move on.
    _mark_row(index, chan='da thu 3 lan', sl=len(list_data))


async def a(key, key1, key2, key3, index, category):
    """Crawl three result pages for ``key`` and store the selected products.

    Fewer than 8 collected items: nothing is stored.  8 to 34: every unique
    item is stored.  35 or more: only the 35 best-selling unique items are
    stored.  On success the sheet row is marked ``'da crawl'``.

    Args:
        key: Search phrase (also slugified into the tag URL).
        key1, key2, key3: Name keywords, see :func:`info_item`.
        index: Row index of the current keyword in the sheet.
        category: Category label stored with each product.
    """
    list_data = []
    key_slug = slugify(key, separator="-", lowercase=True)
    for page in range(1, 4):
        # Tag URL: for keywords that exist as a category on Lazada.
        url = "https://www.lazada.vn/tag/{}/?ajax=true&catalog_redirect_tag=true&page={}&q={}".format(
            key_slug, page, key)
        await get_data(url, list_data, key1, key2, key3, index)
    print(len(list_data))

    # Keep the first occurrence of every item id.
    unique_by_id = {}
    for item in list_data:
        unique_by_id.setdefault(item['itemId'], item)
    unique_items = list(unique_by_id.values())

    if len(list_data) < 8:
        return

    if len(list_data) >= 35:
        sorted_list = sorted(
            unique_items, key=lambda x: x['itemSoldCntShow'], reverse=True)
        print(len(sorted_list))
        selected = sorted_list[:35]
    else:
        selected = unique_items

    for item in selected:
        await save_db(item, category)

    # Mark the row as crawled in the Excel sheet.
    _mark_row(index, crawl='da crawl', sl=len(unique_items))


async def duyet_excel():
    """Walk every row of the sheet and crawl it."""
    for index, row in df.iterrows():
        await a(
            row['key'], row['key1'], row['key2'], row['key3'],
            index, row['test'])


if __name__ == "__main__":
    asyncio.run(duyet_excel())