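# Paste-site page header captured along with the script (not part of the program):
# Untitled
#
# mail@pastecode.io avatar
# unknown
# plain_text
# 2 years ago
# 1.6 kB
# 2
# Indexable
# Never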
import os
import random
import subprocess
import sys
import time
from multiprocessing import Pool
from pdb import set_trace as bb
from urllib.parse import quote_plus

import numpy as np
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

# Search-results URL; the URL-encoded search term fills the placeholder and
# the 1-based page number is appended to the end.
SEARCH_URL_TEMPLATE = "https://www.amazon.com/s?k={}&page="
CHROMEDRIVER_PATH = "/usr/lib/chromium-browser/chromedriver"
MAX_PAGES = 500
SCROLL_STEPS = 8
SCROLL_PAUSE_SECONDS = 0.1


def build_search_url(search_term, page):
    """Return the search-results URL for *search_term* at 1-based *page*.

    The term is URL-encoded so that spaces, ``&``, ``#`` and similar
    characters cannot corrupt the query string.
    """
    return SEARCH_URL_TEMPLATE.format(quote_plus(search_term)) + str(page)


def build_filename(alt_text, suffix):
    """Return the ``.jpg`` file name derived from an image's alt text.

    *suffix* is appended after an underscore to reduce name clashes between
    images that share the same alt text. Spaces become underscores, ``&``
    becomes ``and``, and ``/`` and both quote characters are dropped so the
    result is a single path component.
    """
    name = "{}_{}".format(alt_text, suffix).replace(" ", "_") + ".jpg"
    for old, new in (("/", ""), ("&", "and"), ("'", ""), ('"', "")):
        name = name.replace(old, new)
    return name


def upscale_link(src):
    """Return *src* with the ``UL320_.jpg`` ending swapped for ``UL2000_.jpg``.

    Presumably this requests a higher-resolution variant of the thumbnail;
    links without that marker are returned unchanged.
    """
    return src.replace("UL320_.jpg", "UL2000_.jpg")


def download_image(link, destination):
    """Download *link* to *destination* with ``wget``; return True on success.

    The command is passed as an argument list (no shell), so characters in
    scraped text or in the output directory name cannot be interpreted as
    shell syntax. Failures are reported on stderr instead of raising, so a
    single bad image does not stop the crawl.
    """
    try:
        completed = subprocess.run(["wget", "-O", destination, link], check=False)
    except OSError as err:
        print("could not run wget for {}: {}".format(link, err), file=sys.stderr)
        return False
    if completed.returncode != 0:
        print(
            "wget exited with status {} for {}".format(completed.returncode, link),
            file=sys.stderr,
        )
        return False
    return True


def scrape_page(driver, url, out_dir):
    """Load *url* in *driver* and download every ``img.s-image`` into *out_dir*."""
    driver.get(url)

    # Page down a few times before reading the DOM (presumably so that
    # lazily-loaded result images are present in the page source).
    body = driver.find_element(By.CSS_SELECTOR, "body")
    for _ in range(SCROLL_STEPS):
        time.sleep(SCROLL_PAUSE_SECONDS)
        body.send_keys(Keys.PAGE_DOWN)

    soup = BeautifulSoup(driver.page_source, "html.parser")
    for img in soup.find_all("img", class_="s-image"):
        alt_text = img.get("alt")
        src = img.get("src")
        if alt_text is None or src is None:
            # Without both attributes there is nothing to name or fetch.
            continue
        name = build_filename(alt_text, np.random.randint(100))
        download_image(upscale_link(src), os.path.join(out_dir, name))


def main():
    """Download search-result images for the term given as ``sys.argv[1]``.

    Images are stored in a directory named after the search term, which is
    created if it does not exist yet.
    """
    if len(sys.argv) < 2:
        sys.exit("usage: {} SEARCH_TERM".format(sys.argv[0]))
    asset_id = sys.argv[1]
    os.makedirs(asset_id, exist_ok=True)

    options = Options()
    options.headless = True
    # NOTE(review): newer Selenium 4 releases may no longer accept a positional
    # driver path or the `headless` setter -- confirm against the installed
    # version before upgrading.
    driver = webdriver.Chrome(CHROMEDRIVER_PATH, options=options)
    try:
        for page in range(1, MAX_PAGES + 1):
            scrape_page(driver, build_search_url(asset_id, page), asset_id)
    finally:
        # Always shut the browser down, even if a page load raises.
        driver.quit()


if __name__ == "__main__":
    main()