import csv
import random
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.common.exceptions import NoSuchElementException

ACCOUNTS = [
    {'username': '_shapira_shapira', 'password': 'SHap200400505'},
    {'username': 'dan.spinning.culture', 'password': '200400505'},
]

options = Options()
options.add_argument("-profile")
options.add_argument("/Users/ss/Library/Application Support/Firefox/Profiles/s5ojir44.shapira")
browser = webdriver.Firefox(options=options)


def switch_account():
    """Switch to a different Instagram account."""
    # Log out from the current account, if any
    browser.get("https://www.instagram.com/")
    time.sleep(5)
    browser.find_element(By.XPATH, "/html/body/div[2]/div/div/div[2]/div/div/div/div[1]/div[1]/div[1]/div/div/div/div/div[3]/span/div").click()
    time.sleep(2)
    browser.find_element(By.XPATH, "/html/body/div[2]/div/div/div[2]/div/div/div/div[2]/div/div/div[1]/div[1]/div/div/div/div/div/div/div[1]/div/div[6]/div[1]").click()
    time.sleep(2)
    # Randomly pick an account from the list and log in
    account = random.choice(ACCOUNTS)
    username = account["username"]
    password = account["password"]
    browser.get("https://www.instagram.com/accounts/login/")
    time.sleep(2)
    username_input = browser.find_element(By.NAME, "username")
    password_input = browser.find_element(By.NAME, "password")
    username_input.send_keys(username)
    password_input.send_keys(password)
    login_button = browser.find_element(By.XPATH, "//button[@type='submit']")
    login_button.click()
    time.sleep(5)  # A generous sleep to ensure the login completes


def random_scrolling():
    """Randomly scroll the profile page."""
    browser.execute_script("window.scrollTo(0, document.body.scrollHeight/3);")
    time.sleep(random.uniform(0.5, 0.9))
    browser.execute_script("window.scrollTo(0, 0);")
    time.sleep(random.uniform(0.5, 0.9))


def visit_random_page():
    """Occasionally visit Explore or the homepage."""
    pages = ["https://www.instagram.com/", "https://www.instagram.com/explore/"]
    chosen_page = random.choice(pages)
    try:
        print(f"Visiting: {chosen_page}")
        browser.get(chosen_page)
        time.sleep(random.uniform(1, 3))
    except Exception as e:
        print(f"Error visiting {chosen_page}. Error: {e}")
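# Hedged aside (not part of the original script): the fixed time.sleep() calls
# above are fragile under slow page loads. Selenium's explicit waits poll for a
# condition instead; below is a minimal sketch of such a helper built on
# WebDriverWait, unused by the script as written.

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


def wait_for(xpath, timeout=10):
    """Block until the element at `xpath` is present, up to `timeout` seconds."""
    return WebDriverWait(browser, timeout).until(
        EC.presence_of_element_located((By.XPATH, xpath))
    )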
def parse_int_from_string(s):
    """Parse a count rendered as a string into an integer. Handles 'K' and 'M'."""
    if 'K' in s:
        return int(float(s.replace('K', '')) * 1000)
    elif 'M' in s:
        return int(float(s.replace('M', '')) * 1000000)
    else:
        return int(s.replace(',', ''))


def check_profile_type():
    """Return 'public', 'private', or None if the page layout is unrecognized."""
    try:
        browser.find_element(By.XPATH, "/html/body/div[2]/div/div/div[2]/div/div/div/div[1]/div[1]/div[2]/div[2]/section/main/div/div[2]")
        return "public"
    except NoSuchElementException:
        pass
    try:
        browser.find_element(By.XPATH, "/html/body/div[2]/div/div/div[2]/div/div/div/div[1]/div[1]/div[2]/div[2]/section/main/div/div/article/div/div/h2")
        return "private"
    except NoSuchElementException:
        pass
    return None


def get_profile_data(username):
    retries = 3  # Number of retry attempts (must be >= 1 or the loop below never runs)
    delay_between_retries = 3  # Delay in seconds

    if random.random() < 0.99:  # 99% likelihood to switch account on each profile visit
        switch_account()
    if random.random() < 0.25:  # 25% likelihood to wander off to Explore or the homepage
        visit_random_page()

    for attempt in range(retries):
        try:
            browser.get(f'https://www.instagram.com/{username}/')
            time.sleep(random.uniform(2, 5))
            profile_type = check_profile_type()
            if not profile_type:
                raise Exception("Couldn't determine profile type (Public/Private/None).")

            data_elements = [
                {"name": "followers", "xpath": "/html/body/div[2]/div/div/div[2]/div/div/div/div[1]/div[1]/div[2]/div[2]/section/main/div/header/section/ul/li[2]/a/span", "type": "public"},
                {"name": "following", "xpath": "/html/body/div[2]/div/div/div[2]/div/div/div/div[1]/div[1]/div[2]/div[2]/section/main/div/header/section/ul/li[3]/a/span", "type": "public"},
                {"name": "num_posts", "xpath": "/html/body/div[2]/div/div/div[2]/div/div/div/div[1]/div[1]/div[2]/div[2]/section/main/div/header/section/ul/li[1]/span", "type": "public"},
                {"name": "bio", "xpath": "/html/body/div[2]/div/div/div[2]/div/div/div/div[1]/div[1]/div[2]/div[2]/section/main/div/header/section/div[3]/h1", "type": "public"},
                {"name": "followers", "xpath": "/html/body/div[2]/div/div/div[2]/div/div/div/div[1]/div[1]/div[2]/div[2]/section/main/div/header/section/ul/li[2]/span/span", "type": "private"},
                {"name": "following", "xpath": "/html/body/div[2]/div/div/div[2]/div/div/div/div[1]/div[1]/div[2]/div[2]/section/main/div/header/section/ul/li[3]/span/span", "type": "private"},
                {"name": "num_posts", "xpath": "/html/body/div[2]/div/div/div[2]/div/div/div/div[1]/div[1]/div[2]/div[2]/section/main/div/header/section/ul/li[1]/span/span", "type": "private"},
                {"name": "bio", "xpath": "/html/body/div[2]/div/div/div[2]/div/div/div/div[1]/div[1]/div[2]/div[2]/section/main/div/header/section/div[3]/h1", "type": "private"},
            ]
            random.shuffle(data_elements)  # Visit the fields in a random order

            data = {"username": username, "followers": "Error", "following": "Error",
                    "num_posts": "Error", "bio": "Error",
                    "profile_type": profile_type.capitalize()}
            for element in data_elements:
                if element["type"] != profile_type:
                    continue
                try:
                    if element["name"] in ("num_posts", "followers", "following"):
                        data[element["name"]] = parse_int_from_string(
                            browser.find_element(By.XPATH, element["xpath"]).text)
                    else:
                        data[element["name"]] = browser.find_element(By.XPATH, element["xpath"]).text
                except NoSuchElementException:
                    if element["name"] == "bio":
                        data["bio"] = ""  # Treat a missing bio as empty rather than an error
                    continue

            return [data["username"], data["followers"], data["following"],
                    data["num_posts"], data["bio"], data["profile_type"]]
        except Exception as e:
            print(f"Error scraping {username} on attempt {attempt + 1}: {str(e)}")
            if attempt < retries - 1:
                print(f"Retrying in {delay_between_retries} seconds...")
                time.sleep(delay_between_retries)
            else:
                # After all retries are exhausted, take a screenshot for debugging
                browser.save_screenshot(f"error_{username}.png")
                print(f"Failed to scrape {username} after {retries} attempts. Moving on...")
                return [username, "Error", "Error", "Error", "Error", "Unknown"]
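# Hedged aside (not from the original script): the absolute /html/body/...
# XPaths above break whenever Instagram reshuffles its markup. A relative
# locator keyed on the profile header is usually more durable; the selector
# below is an assumption about the page structure, not verified against the
# live site, and the helper is unused by the script as written.

def get_header_stats():
    """Sketch: read the posts/followers/following counters via a relative XPath."""
    return [li.text for li in browser.find_elements(By.XPATH, "//header//ul/li")]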
def get_already_scraped_usernames():
    try:
        with open('scraped_data.csv', mode='r', newline='', encoding='utf-8') as file:
            reader = csv.reader(file)
            already_scraped = [row[0] for row in reader]
        return already_scraped
    except FileNotFoundError:
        return []


def scrape_usernames_from_csv():
    already_scraped = get_already_scraped_usernames()

    with open('usernames.csv', mode='r', newline='', encoding='utf-8') as file:
        reader = csv.reader(file)
        all_usernames = [row[0] for row in reader]

    # If scraped_data.csv is missing or lacks a header row, write the headers.
    if 'Username' not in already_scraped:
        with open('scraped_data.csv', mode='a', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow(['Username', 'Followers', 'Following', 'NumPosts', 'Bio', 'ProfileStatus'])

    for username in all_usernames:
        if username in already_scraped:
            print(f"{username} has already been scraped. Skipping.")
            continue
        print(f"Scraping data for {username}")
        data = get_profile_data(username)
        with open('scraped_data.csv', mode='a', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow(data)

    print("Finished scraping all usernames!")
    browser.quit()


scrape_usernames_from_csv()
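# Illustrative file layout (hypothetical rows, not real data): usernames.csv
# holds one handle per row; scraped_data.csv is what scrape_usernames_from_csv()
# appends, header first:
#
#   usernames.csv          scraped_data.csv
#   nasa                   Username,Followers,Following,NumPosts,Bio,ProfileStatus
#   some_private_user      nasa,96500000,77,3800,"Exploring the universe",Public
#                          some_private_user,120,300,45,"",Private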
# --------------------------------------------------------------------------
# A second, near-verbatim copy of the same script followed here (a Linux
# variant). The only differences from the script above were:
#
#   * The Firefox profile path:
#       options.add_argument("/home/cube/snap/firefox/common/.mozilla/firefox/u64wyo54.scrape")
#   * Account switching was made unconditional (random.random() < 1) and the
#     random page visits were disabled (random.random() < 0.0).
#   * The ACCOUNTS list and switch_account() were omitted, so the variant's
#     unconditional switch_account() call would raise NameError as written;
#     the definitions above are required for it to run.
#
# The duplicated helper functions were identical and are not repeated.
# --------------------------------------------------------------------------
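# Hedged aside (not part of the original paste): as written, an unhandled
# exception leaves the Firefox process running, because browser.quit() is only
# reached at the end of a successful run. A minimal sketch of an entry point
# that always shuts the browser down, assuming scrape_usernames_from_csv()
# drops its own browser.quit() call:
#
# try:
#     scrape_usernames_from_csv()
# finally:
#     browser.quit()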