import json
import time
from slugify import slugify
import threading
import random
import asyncio
import math
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
import requests
# Proxy settings consumed when the Chrome proxy-auth extension is generated.
# They start empty and are overwritten at runtime by get_proxy_sele().
PROXY_HOST = '' # rotating proxy or host
PROXY_PORT = ''# port
PROXY_USER = '' # username
PROXY_PASS = '' # password
# Text of the extension's manifest.json and background.js; both are filled in by getmb().
manifest_json=""
background_js =""
import pandas as pd
import requests
import pymysql
import os
import zipfile
# Path to the Excel workbook that is loaded into `df` below
excel_file = './test.xlsx'
import aiohttp
import asyncio
import json
# Shared MySQL connection used by save_db(); it is opened once at import time.
# NOTE(review): host, user and password are hard-coded in source — consider loading them from configuration or the environment.
connection_data=pymysql.connect(host="103.110.84.6", user="pdjtvluc_tuan", passwd="lazadasp1", db="pdjtvluc_lazadadb",port=3306)
async def save_db(item, category_sp):
    """Insert one crawled product into the ``product`` table.

    Args:
        item: Product dict; the keys read are ``itemId``, ``name``,
            ``discount``, ``price``, ``originalPrice``, ``Shopmall``,
            ``itemSoldCntShow``, ``image``, ``images``, ``review``,
            ``itemUrl``, ``ratingScore`` and ``location``.
        category_sp: Value stored in the ``category`` column.

    Raises:
        KeyError: If ``item`` lacks one of the keys listed above.
        TypeError, ValueError: If a numeric field cannot be converted.
        pymysql.MySQLError: If reconnecting or the INSERT fails.
    """
    is_load = 0
    stock = 0
    conn = connection_data
    sql = """
INSERT INTO product(product_id,name ,discount,price,price_min,price_max,typer_shop,historical_sold,image,images,liked_count,
link,rating,shop_location,stock,category,is_load
) VALUE(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
"""
    # Column mapping: price_min <- price, price_max <- originalPrice.
    # NOTE(review): liked_count is filled from item['review'] — confirm this is intended.
    val = (
        int(item['itemId']), str(item['name']), float(item['discount']), float(item['price']),
        float(item['price']), float(item['originalPrice']), str(item['Shopmall']),
        float(item['itemSoldCntShow']), str(item['image']), str(item['images']), str(item['review']),
        str(item['itemUrl']), float(item['ratingScore']), str(item['location']),
        stock, category_sp, is_load,
    )
    # The shared connection is closed at the end of every call, so ping()
    # (which reconnects by default) must run before a cursor is used.
    conn.ping()
    try:
        with conn.cursor() as cursor:
            cursor.execute(sql, val)
        conn.commit()
    finally:
        # Close even when the insert fails so a half-open transaction is not
        # carried into the next call.
        conn.close()
# Load the Excel workbook into a DataFrame; the crawl functions read rows from it and write status columns back
df = pd.read_excel(excel_file)
# NOTE(review): this module-level `path` is not referenced by any code visible in this file
# (the driver factories compute their own local `path`) — confirm before removing.
path = "C:/Users/Admin/Desktop/lazada/backend/tool/chromedriver.exe"
async def fetch(url):
    """GET *url* with a throwaway aiohttp session and return the body as text."""
    async with aiohttp.ClientSession() as http:
        async with http.get(url) as resp:
            body = await resp.text()
    return body
async def get_proxy_sele():
    """Request a fresh proxy from the tinproxy API, retrying until one is obtained.

    On success the module-level ``PROXY_HOST``, ``PROXY_PORT``,
    ``PROXY_USER`` and ``PROXY_PASS`` are updated.

    Returns:
        dict: ``{"PROXY_HOST", "PROXY_PORT", "PROXY_USER", "PROXY_PASS"}``
        describing the proxy that was obtained.
    """
    global PROXY_HOST, PROXY_PORT, PROXY_USER, PROXY_PASS

    location = random.choice(["vn_dn", "vn_hcm"])
    # NOTE(review): the API key and the whitelisted IP are hard-coded in source.
    url = "https://api.tinproxy.com/proxy/get-new-proxy?authen_ips=14.191.157.215&location={}&api_key=70QpHJ5sKS3i7kgdJtuKrdh9M0BsHXbK".format(location)

    count = 0
    while True:
        # Back-off: after two consecutive failures wait a minute before each
        # of the next attempts, then start the cycle over.
        if count >= 2:
            print("dang ngu dong")
            # Non-blocking sleep so the event loop is not frozen for a minute.
            await asyncio.sleep(60)
            count = count + 1
        if count >= 4:
            count = 0
        try:
            data = json.loads(await fetch(url))
            server_host = data['data']['http_ipv6_ipv4']
            user_name = data['data']['authentication']['username']
            pass_word = data['data']['authentication']['password']
            # rsplit keeps working if the host part itself contains colons;
            # an empty or missing host raises here and triggers a retry.
            ip, port = server_host.rsplit(":", 1)
        except Exception as exc:
            # Retry boundary: any network, JSON or schema problem means
            # "no proxy yet", so report it and try again.
            count = count + 1
            print("get proxy failed, retrying:", exc)
            continue
        break

    PROXY_HOST = ip
    PROXY_PORT = port
    PROXY_USER = user_name
    PROXY_PASS = pass_word
    my_proxies2 = {
        "PROXY_HOST": ip,
        "PROXY_PORT": port,
        "PROXY_USER": user_name,
        "PROXY_PASS": pass_word,
    }
    print("get proxy cho selenium", my_proxies2)
    return my_proxies2
async def getmb():
    """Fetch a new proxy and regenerate the Chrome proxy-extension sources.

    Rewrites the module-level ``manifest_json`` and ``background_js`` strings,
    which the driver factories zip into ``proxy_auth_plugin.zip``. The
    background script routes traffic through the proxy and answers its
    authentication challenge.
    """
    global manifest_json
    global background_js
    print("lay proxy")
    proxy = await get_proxy_sele()
    # Non-blocking pause; time.sleep() here would stall the event loop.
    await asyncio.sleep(1)
    manifest_json = """
{
"version": "1.0.0",
"manifest_version": 2,
"name": "Chrome Proxy",
"permissions": [
"proxy",
"tabs",
"unlimitedStorage",
"storage",
"<all_urls>",
"webRequest",
"webRequestBlocking"
],
"background": {
"scripts": ["background.js"]
},
"minimum_chrome_version":"22.0.0"
}
"""
    # The four %s slots are, in order: host, port, username, password.
    background_js = """
var config = {
mode: "fixed_servers",
rules: {
singleProxy: {
scheme: "http",
host: "%s",
port: parseInt(%s)
},
bypassList: ["localhost"]
}
};
chrome.proxy.settings.set({value: config, scope: "regular"}, function() {});
function callbackFn(details) {
return {
authCredentials: {
username: "%s",
password: "%s"
}
};
}
chrome.webRequest.onAuthRequired.addListener(
callbackFn,
{urls: ["<all_urls>"]},
['blocking']
);
""" % (proxy["PROXY_HOST"], proxy["PROXY_PORT"], proxy["PROXY_USER"], proxy["PROXY_PASS"])
def get_chromedriver2(use_proxy=True, user_agent=None):
    """Delete any existing proxy-extension archive, then build a Chrome driver.

    Identical to ``get_chromedriver`` except that a leftover
    ``proxy_auth_plugin.zip`` in the working directory is removed first, so
    the driver setup itself is delegated instead of being duplicated.

    Args:
        use_proxy: When true, the proxy extension is generated and loaded.
        user_agent: Optional User-Agent string passed to Chrome.

    Returns:
        The started Chrome WebDriver.
    """
    pluginfile = 'proxy_auth_plugin.zip'
    # Remove the archive if it is already there
    if os.path.exists(pluginfile):
        os.remove(pluginfile)
        print("Đã xóa tệp", pluginfile)
    return get_chromedriver(use_proxy=use_proxy, user_agent=user_agent)
def get_chromedriver(use_proxy=True, user_agent=None):
    """Start Chrome through the ``chromedriver`` binary located next to this file.

    Args:
        use_proxy: When true, ``manifest_json`` and ``background_js`` are
            zipped into ``proxy_auth_plugin.zip`` (in the working directory)
            and loaded as an extension.
        user_agent: Optional User-Agent string passed to Chrome.

    Returns:
        The started Chrome WebDriver.
    """
    driver_dir = os.path.dirname(os.path.abspath(__file__))
    chrome_options = webdriver.ChromeOptions()
    time.sleep(0.2)
    if use_proxy:
        pluginfile = 'proxy_auth_plugin.zip'
        with zipfile.ZipFile(pluginfile, 'w') as zp:
            zp.writestr("manifest.json", manifest_json)
            zp.writestr("background.js", background_js)
        chrome_options.add_extension(pluginfile)
    if user_agent:
        chrome_options.add_argument('--user-agent=%s' % user_agent)
    chrome_options.add_experimental_option('useAutomationExtension', False)
    # add_experimental_option() stores one value per name, so every switch to
    # exclude must be passed in a single call; a second call would discard
    # "enable-automation".
    chrome_options.add_experimental_option(
        "excludeSwitches",
        [
            "enable-automation",
            "ignore-certificate-errors",
            "safebrowsing-disable-download-protection",
            "safebrowsing-disable-auto-update",
            "disable-client-side-phishing-detection",
        ],
    )
    # NOTE(review): the positional driver path and the `chrome_options=` keyword
    # are deprecated in recent Selenium releases — confirm the installed
    # version before switching to `service=` / `options=`.
    driver = webdriver.Chrome(
        os.path.join(driver_dir, 'chromedriver'),
        chrome_options=chrome_options)
    # Hide the navigator.webdriver flag from page scripts.
    driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
    return driver
async def get_cookie(url2):
    """Load *url2* in a proxied Chrome session and return the text of its ``<pre>`` body.

    Each attempt obtains a new proxy, opens the Lazada landing page, opens
    *url2* in a new tab and reads ``/html/body/pre``. If that element cannot
    be read, the browser is shut down and the whole sequence is repeated with
    another proxy. There is no retry limit: the coroutine only returns once a
    response has been read.

    Args:
        url2: URL whose raw response text is wanted.

    Returns:
        str: Text content of the ``<pre>`` element.
    """
    landing_url = "https://www.lazada.vn/dien-thoai-di-dong/"
    first_attempt = True
    while True:
        await getmb()
        if first_attempt:
            driver = get_chromedriver(use_proxy=True)
            first_attempt = False
        else:
            await asyncio.sleep(0.5)
            driver = get_chromedriver2(use_proxy=True)
        try:
            driver.set_window_rect(100, 200, 375, 812)
            driver.get(landing_url)
            # Pass the URL as a script argument so quotes in it cannot break
            # the JavaScript snippet.
            driver.execute_script("window.open(arguments[0], '_blank')", url2)
            # Switch to the tab that was just opened; the response is
            # rendered there, not in the landing-page tab.
            driver.switch_to.window(driver.window_handles[-1])
            body = driver.find_element("xpath", "/html/body/pre").text
        except Exception as exc:
            # Retry boundary: any browser or proxy failure means this session
            # is unusable, so report it and start over with a new proxy.
            print("get_cookie failed, retrying with a new proxy:", exc)
            await asyncio.sleep(0.8)
            continue
        finally:
            # quit() ends the browser and the chromedriver process, so failed
            # attempts no longer leak Chrome instances.
            driver.quit()
        await asyncio.sleep(0.2)
        return body
def get_new_proxy():
    """Request a proxy from the tinproxy API and return it as a ``requests`` proxies dict.

    The request is repeated every 30 seconds until the response contains the
    expected fields; network errors and malformed responses are retried too.

    Returns:
        dict: ``{"http": "http://user:pass@host:port"}``.
    """
    # NOTE(review): only the "http" scheme is mapped, so requests would not use
    # this proxy for https:// URLs — confirm whether an "https" entry is wanted.
    # NOTE(review): the API key and the whitelisted IP are hard-coded in source.
    url = "https://api.tinproxy.com/proxy/get-new-proxy?authen_ips=103.107.183.30&location=vn_hcm&api_key=70QpHJ5sKS3i7kgdJtuKrdh9M0BsHXbK"
    while True:
        try:
            r = requests.get(url, timeout=30)
            data = json.loads(r.text)
            server_host = data['data']['http_ipv6_ipv4']
            user_name = data['data']['authentication']['username']
            pass_word = data['data']['authentication']['password']
        except (requests.RequestException, ValueError, KeyError, TypeError) as exc:
            print("get_new_proxy failed, retrying in 30s:", exc)
            time.sleep(30)
            continue
        my_proxies = {
            "http": "http://{}:{}@{}".format(user_name, pass_word, server_host),
        }
        print(my_proxies)
        return my_proxies
def _to_float(value, default=0.0):
    """Convert *value* to float, returning *default* if it is missing or not numeric."""
    try:
        return float(value)
    except (TypeError, ValueError):
        return default


def _parse_sold_count(raw):
    """Reduce a sold-count label such as ``"1,2k sold"`` to the float of its leading number."""
    if not isinstance(raw, str):
        return _to_float(raw)
    text = raw
    # Cut the label at each known suffix, in the same order as before.
    for marker in ("sold", "Đã bán", "+ Đã bán", "+", "k"):
        text = text.partition(marker)[0].strip()
    # NOTE(review): a trailing "k" is dropped without scaling, so "1,2k"
    # becomes 1.2 — confirm whether it should be multiplied by 1000.
    return _to_float(text.replace(",", "."))


def info_item(data, list_item, key1, key2, key3):
    """Append the products in *data* whose name matches the keywords to *list_item*.

    Args:
        data: Decoded search response; products are read from
            ``data['mods']['listItems']``.
        list_item: List that receives one dict per matching product.
        key1, key2: Keywords that must both occur in the lower-cased name.
        key3: Third keyword selector (see the matching rule in the body).

    Raises:
        KeyError, TypeError: If *data* lacks ``mods``/``listItems`` or a
            product lacks a required field. Callers use this to detect an
            unusable response.
    """
    items = data['mods']['listItems']
    for item in items:
        # Required fields: a missing key propagates to the caller.
        item_id = item['itemId']
        brand_id = item['brandId']
        brand_name = item['brandName']
        image = item['image']
        item_url = item['itemUrl']
        location = item['location']
        name = item['name']
        nid = item['nid']
        seller_id = item['sellerId']
        thumbs = item['thumbs']

        # Optional fields are reset for every product so values never leak
        # over from the previous one.
        raw_discount = item.get('discount')
        discount = raw_discount.split("%")[0] if isinstance(raw_discount, str) else 0
        try:
            shop_mall = item['icons'][0]['alias']
        except (KeyError, IndexError, TypeError):
            shop_mall = ""
        images = []
        try:
            for thumb in thumbs:
                images.append(thumb['image'])
        except (KeyError, TypeError):
            pass

        name_new = name.lower()
        key1_new = key1.lower()
        key2_new = key2.lower()
        # NOTE(review): key3 is only compared against the name when it equals
        # "x"; any other value is ignored. This looks inverted — confirm
        # against the spreadsheet convention before changing it.
        if key3 != "x":
            time.sleep(0.1)
            matched = key1_new in name_new and key2_new in name_new
        else:
            matched = (key1_new in name_new and key2_new in name_new
                       and key3.lower() in name_new)
        if not matched:
            continue

        list_item.append({
            "key": item_id,
            "itemId": item_id,
            "brandId": brand_id,
            "brandName": brand_name,
            "discount": discount,
            "image": image,
            "itemSoldCntShow": _parse_sold_count(item.get('itemSoldCntShow')),
            "itemUrl": item_url,
            "location": location,
            "name": name,
            "nid": nid,
            # price max
            "originalPrice": item.get('originalPrice', 0),
            "originalPriceShow": item.get('originalPriceShow', ""),
            "price": _to_float(item.get('price')),
            # price min
            "priceShow": item.get('priceShow', 0),
            "ratingScore": item.get('ratingScore', 0),
            "review": item.get('review', ""),
            "sellerId": seller_id,
            "Shopmall": shop_mall,
            "images": images,
        })
async def get_data(url, list_data, key1, key2, key3, index):
    """Download one search-result page and append its matching products to *list_data*.

    The page is first requested directly with ``requests``. If that request
    or the parsing of its response fails, the URL is loaded through a proxied
    browser (``get_cookie``) up to four times. When every attempt fails, row
    *index* of the spreadsheet is marked and saved.

    Args:
        url: Search URL that is expected to return JSON.
        list_data: List extended in place by ``info_item``.
        key1, key2, key3: Keyword filters forwarded to ``info_item``.
        index: Row label in ``df`` used for status bookkeeping.
    """
    # NOTE(review): a captured browser session (cookie and CSRF token) is
    # hard-coded below; it will expire and should not live in source control.
    headers = {
        "cookie": (
            "__wpkreporterwid_=813ed48b-595c-4adf-9f22-24f1b8328e2a; t_fv=1670425558316; t_uid=ZfqbNVCVcuji2pNujFDgD7X8IpYsozLZ; cna=1pkXHJ5Pty8CAXRvHRQmtaHQ; lzd_cid=96503b91-03f6-4a84-a33d-a69b147980af; miidlaz=miidgg5rbu1gmghens6gt8p; lwrid=AQGHIpP%2FCbpv79mAoewzX39uI4vP; _bl_uid=g6l2nf8gqRFoUCf52j0wg9ktCRb8; _gcl_au=1.1.1020666756.1680854480; _ga=GA1.2.303867327.1681100355; cto_axid=qVhKOIlvdz8gBvNCGiZJpvO9HK21W2gI; pdp_sfo=1; dsa_category_disclaimer=true; _gcl_aw=GCL.1684482945.CjwKCAjwvJyjBhApEiwAWz2nLbwA1YN9Bte0-xwC9cjeLL5W8bOre5Flpn6stpzXeN396SwHoXCH5BoCSaQQAvD_BwE; lzd_click_id=clkgl3rhh1h18ddqulh34f; _uetvid=a70580a0d8e511ed8ba31b9e895a65ea; cto_bundle=o16OV19JR2tsbmZUQjNSMlZXdU1nJTJCSnlJTkdsa0FqeHpOZjgwOHU5QyUyQkFzZzRFSU9MYmNwb1h0anY3bWozS1BIaiUyRkxQUnRoS3kzZURzazZKUnBiNjRMZ0hNSWJxVlNEYzc5QnFDNCUyQkswMXBNVTVQaTVEbXlYaDQ5SGxjSnZMS3ZMTWZOWGc1Nml6TE8wRGRkZmRlJTJCWlpYNkd3JTNEJTNE; AMCV_126E248D54200F960A4C98C6%40AdobeOrg=-1124106680%7CMCIDTS%7C19504%7CMCMID%7C52344899913288371074101774141992786277%7CMCAAMLH-1685675655%7C3%7CMCAAMB-1685675655%7CRKhpRz8krg2tLO6pguXWp5olkAcUniQYPHaMWWgdJ3xzPWQmdj0y%7CMCOPTOUT-1685078055s%7CNONE%7CvVersion%7C5.2.0; sgcookie=E100QFP%2BvohEo9oGTPL5SxXeeRlcKciHrMGTIHpFzH6Ej%2F7suAYVHgFLAVoD6SK81cStjX%2B6v7paxaoNBBGj4OBHTs%2FcXoQaMr2fPRFTDhLnV2A%3D; hng=VN|en|VND|704; userLanguageML=en; xlly_s=1; EGG_SESS=S_Gs1wHo9OvRHCMp98md7DUZe1_Oi7MP7VJBGrkx9fhSS7nQTnQJeqqXEBD6IWifQZY9APBbZ1NYv6XiRfJqEz8D_8i-At2bdf63doGKaAyOxhfxDvfM3CbBdWPewHrMN0lq3c1joYuwyaSJfxbUY4Skd9BkzmJRNg2pqZfhI3M=; "
            "ak_bmsc=366442E74FE13AEE7E9E22CBF35E72F3~000000000000000000000000000000~YAAQnwqrcV0uBmaIAQAAw7vqZxOadtWFkCIDKA3lr2K8jKVeIC2TUkasu4H6eKqmQuWeXkL9exGezkGcDBTQ0Osjf3qxB8vWzlGcSI9HHkP+1rA/LyLzgplq5luPovmO/seeTA9FpIpICoYIvBbPi9q6hpL0TfXYov3Wo4K8JXdxX5pJcy7s+tEaLEdXznfgGgjt9uFcRRZnsLasg9Ch5y3tnfajog3cQaZvMLi/NlRVddfH8v1wl175TKjerLWSxnLqiHushxvP+dM1VUTlzqpuZD8/5rYYtNRXgOCfvIPplv1UhjViv8pdoFtkizVYYBedlO+X1vXmISciWdiPG6ZREcJGzUFZTL4fpn6er4mwQqGdXN16Lpsbv2blc+7SfL8NExaTKT7n; lzd_sid=108de1144761220db8a1b3ec4f3d3855; _m_h5_tk=0eed2417b17557ffec7129878cb23744_1685379977546; _m_h5_tk_enc=bf0dad79f26dddd63250e7d71e0f4568; _tb_token_=fba9008553a37; t_sid=ZnaxX3Uxmr2XbTfOIbtN1nL4EFoWEXGD; utm_channel=NA; bm_sv=5662071E1A54AC7ED7E09096D4EF806D~YAAQnwqrcVuFB2aIAQAAQBVBaBNNYUHVTfW7H252VbTwWqy6HVSsrWwCibETEQQ0liYOnyxdWYwTnHBuHADoFf6UWx7lALZ3NbY4ET8AEMtpsakFkJ/pzDw91QjxrVUeOEF6umAN/tTD/mPAWUGmNPPLZ0jLwbLZsWH4RYlRfza8O/BQVzpD38dsFsAbSzhRj/4RAJioi23RAMhNmaeyIpFW/cjvFWjei4zBw6zueTW9qZP/lGqR5h+6Zv6LQaql~1; tfstk=cpWPBO2LQ8ey4EILWKvF7wNkTWvRZjIlxx-6ZrAhoXaor3Oli-gpmtj90Hiq-Qf..; l=fBjr2dWuTojiVbfXBOfwPurza77OSIRAguPzaNbMi9fP_8fp5g0hW1aR5989C3MNFspJR3l39SNMBeYBqCmLM4QoBKRS3xkmnRScdTf..; isg=BLS04uxkbJMHsv-7VdCYBfSBhXImjdh3aPJCk04VQD_CuVQDdp2oB2p3OelhQRDP"
        ),
        "x-csrf-token":"fba9008553a37",
        "accept": "application/json, text/plain, */*",
        "accept-language": "en-US,en;q=0.9",
        "x-requested-with": "XMLHttpRequest"
    }
    print(url)
    await asyncio.sleep(0.1)

    # First try the plain HTTP request. Network errors are handled like an
    # unusable response so they lead to the browser fallback, not a crash.
    # NOTE(review): requests.get() is a blocking call inside a coroutine.
    try:
        response = requests.get(url, headers=headers, timeout=30)
        info_item(json.loads(response.text), list_data, key1, key2, key3)
        return
    except Exception as exc:
        print("direct request failed, falling back to browser:", exc)

    failures = 0
    while True:
        body = await get_cookie(url)
        await asyncio.sleep(0.1)
        try:
            info_item(json.loads(body), list_data, key1, key2, key3)
            return
        except Exception as exc:
            failures = failures + 1
            print("browser attempt failed:", exc)
        if failures > 3:
            # Every attempt failed: flag the row so it can be told apart from
            # rows that were crawled, and persist the spreadsheet.
            df.loc[index, 'chan'] = 'da thu 3 lan'
            df.loc[index, 'sl'] = len(list_data)
            df.to_excel('./test.xlsx', index=False)
            return
async def a(key, key1, key2, key3, index, category):
    """Crawl the first three result pages for *key* and store the matching products.

    Products are de-duplicated by ``itemId``. With 8 to 34 collected entries
    every unique product is saved; with 35 or more only the 35 best-selling
    unique products are saved. Fewer than 8 entries are ignored. After
    saving, row *index* of the spreadsheet is marked as crawled and written
    back to disk.

    Args:
        key: Search phrase; also slugified into the URL path.
        key1, key2, key3: Keyword filters forwarded to ``get_data``.
        index: Row label in ``df`` for status bookkeeping.
        category: Category value stored with each product.
    """
    list_data = []
    key_slug = slugify(key, separator="-", lowercase=True)
    for page in range(1, 4):
        # Tag endpoint, used for keywords that have no category on Lazada.
        # For a keyword that does have one, the path is
        # https://www.lazada.vn/{slug}/ with the same query string.
        url = "https://www.lazada.vn/tag/{}/?ajax=true&catalog_redirect_tag=true&page={}&q={}".format(key_slug, page, key)
        await get_data(url, list_data, key1, key2, key3, index)
    print(len(list_data))

    # Keep the first occurrence of every itemId.
    unique_items = []
    unique_item_ids = set()
    for item in list_data:
        if item['itemId'] not in unique_item_ids:
            unique_item_ids.add(item['itemId'])
            unique_items.append(item)

    # NOTE(review): the thresholds are applied to the raw list, duplicates
    # included, not to the de-duplicated one — confirm this is intended.
    if len(list_data) < 8:
        return
    if len(list_data) < 35:
        items_to_save = unique_items
    else:
        sorted_list = sorted(unique_items, key=lambda x: x['itemSoldCntShow'], reverse=True)
        print(len(sorted_list))
        # Iterate the list that was actually built; the earlier mismatched
        # variable name raised NameError on this path.
        items_to_save = sorted_list[:35]

    for item in items_to_save:
        await save_db(item, category)
    # Mark the spreadsheet row as crawled.
    df.loc[index, 'crawl'] = 'da crawl'
    df.loc[index, 'sl'] = len(unique_items)
    df.to_excel('./test.xlsx', index=False)
async def duyet_excel():
    """Go through the spreadsheet row by row and crawl each row with ``a``."""
    keyword_columns = ('key', 'key1', 'key2', 'key3')
    for index, row in df.iterrows():
        # Read the keyword columns of the row, then its category column.
        keywords = [row[column] for column in keyword_columns]
        category = row['test']
        await a(*keywords, index, category)
# Run the crawl. asyncio.run() creates the event loop, runs the coroutine to
# completion and closes the loop, replacing the deprecated
# get_event_loop() / run_until_complete() / close() sequence.
asyncio.run(duyet_excel())