# Paste-site export metadata (was: "Untitled / unknown / plain_text /
# 2 years ago / 16 kB / 7 / Indexable") — converted to a comment so the
# file remains valid Python.
import csv
import random
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.common.exceptions import NoSuchElementException
# Pool of Instagram accounts rotated by switch_account().
# NOTE(review): credentials are hardcoded in plain text — move them to
# environment variables or a secrets store before sharing this file.
ACCOUNTS = [
{'username': '_shapira_shapira', 'password': 'SHap200400505'},
{'username': 'dan.spinning.culture', 'password': '200400505'},
]
# Reuse an existing Firefox profile (macOS path) so cookies and any
# logged-in session persist across runs.
options = Options()
options.add_argument("-profile")
options.add_argument("/Users/ss/Library/Application Support/Firefox/Profiles/s5ojir44.shapira")
# Module-level browser instance shared by every helper below.
browser = webdriver.Firefox(options=options)
def switch_account():
    """Log out of the current session, then sign in as a random ACCOUNTS entry."""
    # Open the home page and click through the menu to the logout entry.
    # NOTE: these absolute XPaths are tied to a specific Instagram layout.
    browser.get("https://www.instagram.com/")
    time.sleep(5)
    for menu_xpath in (
        "/html/body/div[2]/div/div/div[2]/div/div/div/div[1]/div[1]/div[1]/div/div/div/div/div[3]/span/div",
        "/html/body/div[2]/div/div/div[2]/div/div/div/div[2]/div/div/div[1]/div[1]/div/div/div/div/div/div/div[1]/div/div[6]/div[1]",
    ):
        browser.find_element(By.XPATH, menu_xpath).click()
        time.sleep(2)
    # Pick a random identity and fill in the login form.
    chosen = random.choice(ACCOUNTS)
    browser.get("https://www.instagram.com/accounts/login/")
    time.sleep(2)
    browser.find_element(By.NAME, "username").send_keys(chosen["username"])
    browser.find_element(By.NAME, "password").send_keys(chosen["password"])
    browser.find_element(By.XPATH, "//button[@type='submit']").click()
    time.sleep(5)  # generous wait so the login round-trip can complete
def random_scrolling():
    """Mimic a human glance: scroll a third of the way down, pause, scroll back up, pause."""
    for script in (
        "window.scrollTo(0, document.body.scrollHeight/3);",
        "window.scrollTo(0, 0);",
    ):
        browser.execute_script(script)
        time.sleep(random.uniform(0.5, 0.9))
def visit_random_page():
    """Occasionally visit Explore and Homepage."""
    destination = random.choice(
        ["https://www.instagram.com/", "https://www.instagram.com/explore/"]
    )
    try:
        print(f"Visiting: {destination}")
        browser.get(destination)
        time.sleep(random.uniform(1, 3))
    except Exception as err:
        # Best-effort navigation: log and carry on rather than abort the run.
        print(f"Error visiting {destination}. Error: {err}")
def parse_int_from_string(s):
    """Parse a human-readable count such as '1,234', '10.5K' or '2M' into an int.

    Generalizes the original: also accepts lowercase suffixes ('k'/'m'),
    a 'B' (billion) suffix, thousands commas inside suffixed numbers, and
    surrounding whitespace. Raises ValueError for non-numeric input, the
    same exception type int()/float() raised before.
    """
    text = s.strip().replace(',', '')
    multipliers = {'K': 1_000, 'M': 1_000_000, 'B': 1_000_000_000}
    suffix = text[-1:].upper()  # empty string for '', so the lookup just misses
    if suffix in multipliers:
        # e.g. '10.5K' -> float('10.5') * 1000 -> 10500
        return int(float(text[:-1]) * multipliers[suffix])
    return int(text)
def check_profile_type():
    """Classify the currently loaded profile page as 'public' or 'private'.

    Returns None when neither marker element is present (page failed to
    load, or the layout these absolute XPaths assume has changed).
    """
    markers = (
        ("public", "/html/body/div[2]/div/div/div[2]/div/div/div/div[1]/div[1]/div[2]/div[2]/section/main/div/div[2]"),
        ("private", "/html/body/div[2]/div/div/div[2]/div/div/div/div[1]/div[1]/div[2]/div[2]/section/main/div/div/article/div/div/h2"),
    )
    for label, xpath in markers:
        try:
            browser.find_element(By.XPATH, xpath)
        except NoSuchElementException:
            continue
        return label
    return None
def get_profile_data(username):
    """Scrape one profile page and return a CSV-ready row.

    Returns [username, followers, following, num_posts, bio, profile_type];
    numeric fields are ints on success and the string "Error" on failure,
    profile_type is "Public"/"Private"/"Unknown".
    """
    # BUG FIX: retries was 0, so `for _ in range(retries)` never executed and
    # the function fell through returning None — the caller then crashed on
    # writer.writerow(None). Three attempts restores the intended retry loop.
    retries = 3
    delay_between_retries = 3  # seconds between attempts

    # (Comments corrected to match the actual thresholds used here.)
    if random.random() < 0.99:  # 99% likelihood: rotate account on this visit
        switch_account()
    if random.random() < 0.25:  # 25% likelihood: browse a random page first
        visit_random_page()

    for attempt in range(retries):
        try:
            browser.get(f'https://www.instagram.com/{username}/')
            time.sleep(random.uniform(2, 5))

            profile_type = check_profile_type()
            if not profile_type:
                raise Exception("Couldn't determine profile type (Public/Private/None).")

            # Public and private layouts expose the same fields at different XPaths.
            data_elements = [
                {"name": "followers", "xpath": "/html/body/div[2]/div/div/div[2]/div/div/div/div[1]/div[1]/div[2]/div[2]/section/main/div/header/section/ul/li[2]/a/span", "type": "public"},
                {"name": "following", "xpath": "/html/body/div[2]/div/div/div[2]/div/div/div/div[1]/div[1]/div[2]/div[2]/section/main/div/header/section/ul/li[3]/a/span", "type": "public"},
                {"name": "num_posts", "xpath": "/html/body/div[2]/div/div/div[2]/div/div/div/div[1]/div[1]/div[2]/div[2]/section/main/div/header/section/ul/li[1]/span", "type": "public"},
                {"name": "bio", "xpath": "/html/body/div[2]/div/div/div[2]/div/div/div/div[1]/div[1]/div[2]/div[2]/section/main/div/header/section/div[3]/h1", "type": "public"},
                {"name": "followers", "xpath": "/html/body/div[2]/div/div/div[2]/div/div/div/div[1]/div[1]/div[2]/div[2]/section/main/div/header/section/ul/li[2]/span/span", "type": "private"},
                {"name": "following", "xpath": "/html/body/div[2]/div/div/div[2]/div/div/div/div[1]/div[1]/div[2]/div[2]/section/main/div/header/section/ul/li[3]/span/span", "type": "private"},
                {"name": "num_posts", "xpath": "/html/body/div[2]/div/div/div[2]/div/div/div/div[1]/div[1]/div[2]/div[2]/section/main/div/header/section/ul/li[1]/span/span", "type": "private"},
                {"name": "bio", "xpath": "/html/body/div[2]/div/div/div[2]/div/div/div/div[1]/div[1]/div[2]/div[2]/section/main/div/header/section/div[3]/h1", "type": "private"},
            ]
            random.shuffle(data_elements)  # vary element access order between visits

            data = {"username": username, "followers": "Error", "following": "Error", "num_posts": "Error", "bio": "Error", "profile_type": profile_type.capitalize()}
            for element in data_elements:
                if element["type"] != profile_type:
                    continue
                try:
                    text = browser.find_element(By.XPATH, element["xpath"]).text
                    if element["name"] in ("num_posts", "followers", "following"):
                        data[element["name"]] = parse_int_from_string(text)
                    else:
                        data[element["name"]] = text
                except NoSuchElementException:
                    if element["name"] == "bio":
                        data["bio"] = ""  # a missing bio is normal, not an error
            return [data["username"], data["followers"], data["following"], data["num_posts"], data["bio"], data["profile_type"]]
        except Exception as e:
            print(f"Error scraping {username} on attempt {attempt + 1}: {str(e)}")
            if attempt < retries - 1:
                print(f"Retrying in {delay_between_retries} seconds...")
                time.sleep(delay_between_retries)
            else:
                # Last attempt failed: capture a screenshot for debugging.
                browser.save_screenshot(f"error_{username}.png")
                print(f"Failed to scrape {username} after {retries} attempts. Moving on...")
    # Reached only after all retries fail (or if retries is ever 0 again):
    # always return a well-formed row so the CSV writer cannot crash.
    return [username, "Error", "Error", "Error", "Error", "Unknown"]
def get_already_scraped_usernames():
    """Return the first column of scraped_data.csv (usernames already done).

    Returns an empty list when the file does not exist yet. Blank rows are
    skipped — previously `row[0]` raised IndexError on an empty row (e.g. a
    stray blank line left by a trailing newline).
    """
    try:
        with open('scraped_data.csv', mode='r', newline='', encoding='utf-8') as file:
            return [row[0] for row in csv.reader(file) if row]
    except FileNotFoundError:
        return []
def scrape_usernames_from_csv():
    """Drive the scrape: read usernames.csv, skip already-scraped names, and
    append one row per profile to scraped_data.csv."""
    already_scraped = get_already_scraped_usernames()
    with open('usernames.csv', mode='r', newline='', encoding='utf-8') as file:
        # Skip blank rows so a trailing newline can't raise IndexError.
        all_usernames = [row[0] for row in csv.reader(file) if row]
    # Write the header once, the first time the output file is created.
    if 'Username' not in already_scraped:
        with open('scraped_data.csv', mode='a', newline='', encoding='utf-8') as file:
            csv.writer(file).writerow(['Username', 'Followers', 'Following', 'NumPosts', 'Bio', 'ProfileStatus'])
    try:
        for username in all_usernames:
            if username in already_scraped:
                print(f"{username} has already been scraped. Skipping.")
                continue
            print(f"Scraping data for {username}")
            data = get_profile_data(username)
            # Reopen in append mode per row so progress survives a crash.
            with open('scraped_data.csv', mode='a', newline='', encoding='utf-8') as file:
                csv.writer(file).writerow(data)
        print("Finished scraping all usernames!")
    finally:
        # FIX: previously the browser was only quit on success, leaking the
        # Firefox process if any scrape raised. Always release it.
        browser.quit()
scrape_usernames_from_csv()
# ----------------------------------------------------------------------
# Second copy of the scraper pasted below (Linux/snap profile variant).
# When run as one file, the definitions below shadow the ones above, and
# `browser` is rebound here — the first Firefox instance is left running.
# NOTE(review): this duplicate should probably live in a separate file.
# ----------------------------------------------------------------------
import csv
import random
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.common.exceptions import NoSuchElementException
# Reuse an existing Firefox profile (Linux/snap path) so sessions persist.
options = Options()
options.add_argument("-profile")
options.add_argument("/home/cube/snap/firefox/common/.mozilla/firefox/u64wyo54.scrape")
# Module-level browser instance shared by the helpers below.
browser = webdriver.Firefox(options=options)
def random_scrolling():
    """Scroll a third of the way down the page, then back to the top, with short pauses."""
    scripts = [
        "window.scrollTo(0, document.body.scrollHeight/3);",
        "window.scrollTo(0, 0);",
    ]
    for js in scripts:
        browser.execute_script(js)
        time.sleep(random.uniform(0.5, 0.9))
def visit_random_page():
    """Occasionally visit Explore and Homepage."""
    candidates = ["https://www.instagram.com/", "https://www.instagram.com/explore/"]
    target = random.choice(candidates)
    try:
        print(f"Visiting: {target}")
        browser.get(target)
        time.sleep(random.uniform(1, 3))
    except Exception as exc:
        # Best-effort: report the failure and continue the run.
        print(f"Error visiting {target}. Error: {exc}")
def parse_int_from_string(s):
    """Parse a human-readable count such as '1,234', '10.5K' or '2M' into an int.

    Generalizes the original: also accepts lowercase suffixes ('k'/'m'),
    a 'B' (billion) suffix, thousands commas inside suffixed numbers, and
    surrounding whitespace. Raises ValueError for non-numeric input, the
    same exception type int()/float() raised before.
    """
    text = s.strip().replace(',', '')
    multipliers = {'K': 1_000, 'M': 1_000_000, 'B': 1_000_000_000}
    suffix = text[-1:].upper()  # empty string for '', so the lookup just misses
    if suffix in multipliers:
        # e.g. '10.5K' -> float('10.5') * 1000 -> 10500
        return int(float(text[:-1]) * multipliers[suffix])
    return int(text)
def check_profile_type():
    """Return 'public' or 'private' for the loaded profile, or None if undetectable.

    Detection probes two layout-specific marker elements via absolute XPaths;
    both probes missing means the page didn't load or the layout changed.
    """
    probes = (
        ("public", "/html/body/div[2]/div/div/div[2]/div/div/div/div[1]/div[1]/div[2]/div[2]/section/main/div/div[2]"),
        ("private", "/html/body/div[2]/div/div/div[2]/div/div/div/div[1]/div[1]/div[2]/div[2]/section/main/div/div/article/div/div/h2"),
    )
    for kind, xpath in probes:
        try:
            browser.find_element(By.XPATH, xpath)
            return kind
        except NoSuchElementException:
            pass
    return None
def get_profile_data(username):
    """Scrape one profile page and return a CSV-ready row.

    Returns [username, followers, following, num_posts, bio, profile_type];
    numeric fields are ints on success and the string "Error" on failure,
    profile_type is "Public"/"Private"/"Unknown".
    """
    # BUG FIX: retries was 0, so `for _ in range(retries)` never executed and
    # the function fell through returning None — the caller then crashed on
    # writer.writerow(None). Three attempts restores the intended retry loop.
    retries = 3
    delay_between_retries = 3  # seconds between attempts

    # (Comments corrected: the original labels "5%"/"20%" did not match these thresholds.)
    if random.random() < 1:  # always true: switch account on every visit
        # NOTE(review): switch_account() is not defined in this second copy of
        # the script — it only exists in the first half of this file. Confirm
        # it is in scope wherever this variant actually runs.
        switch_account()
    if random.random() < 0.0:  # never true: random-page visits disabled
        visit_random_page()

    for attempt in range(retries):
        try:
            browser.get(f'https://www.instagram.com/{username}/')
            time.sleep(random.uniform(2, 5))

            profile_type = check_profile_type()
            if not profile_type:
                raise Exception("Couldn't determine profile type (Public/Private/None).")

            # Public and private layouts expose the same fields at different XPaths.
            data_elements = [
                {"name": "followers", "xpath": "/html/body/div[2]/div/div/div[2]/div/div/div/div[1]/div[1]/div[2]/div[2]/section/main/div/header/section/ul/li[2]/a/span", "type": "public"},
                {"name": "following", "xpath": "/html/body/div[2]/div/div/div[2]/div/div/div/div[1]/div[1]/div[2]/div[2]/section/main/div/header/section/ul/li[3]/a/span", "type": "public"},
                {"name": "num_posts", "xpath": "/html/body/div[2]/div/div/div[2]/div/div/div/div[1]/div[1]/div[2]/div[2]/section/main/div/header/section/ul/li[1]/span", "type": "public"},
                {"name": "bio", "xpath": "/html/body/div[2]/div/div/div[2]/div/div/div/div[1]/div[1]/div[2]/div[2]/section/main/div/header/section/div[3]/h1", "type": "public"},
                {"name": "followers", "xpath": "/html/body/div[2]/div/div/div[2]/div/div/div/div[1]/div[1]/div[2]/div[2]/section/main/div/header/section/ul/li[2]/span/span", "type": "private"},
                {"name": "following", "xpath": "/html/body/div[2]/div/div/div[2]/div/div/div/div[1]/div[1]/div[2]/div[2]/section/main/div/header/section/ul/li[3]/span/span", "type": "private"},
                {"name": "num_posts", "xpath": "/html/body/div[2]/div/div/div[2]/div/div/div/div[1]/div[1]/div[2]/div[2]/section/main/div/header/section/ul/li[1]/span/span", "type": "private"},
                {"name": "bio", "xpath": "/html/body/div[2]/div/div/div[2]/div/div/div/div[1]/div[1]/div[2]/div[2]/section/main/div/header/section/div[3]/h1", "type": "private"},
            ]
            random.shuffle(data_elements)  # vary element access order between visits

            data = {"username": username, "followers": "Error", "following": "Error", "num_posts": "Error", "bio": "Error", "profile_type": profile_type.capitalize()}
            for element in data_elements:
                if element["type"] != profile_type:
                    continue
                try:
                    text = browser.find_element(By.XPATH, element["xpath"]).text
                    if element["name"] in ("num_posts", "followers", "following"):
                        data[element["name"]] = parse_int_from_string(text)
                    else:
                        data[element["name"]] = text
                except NoSuchElementException:
                    if element["name"] == "bio":
                        data["bio"] = ""  # a missing bio is normal, not an error
            return [data["username"], data["followers"], data["following"], data["num_posts"], data["bio"], data["profile_type"]]
        except Exception as e:
            print(f"Error scraping {username} on attempt {attempt + 1}: {str(e)}")
            if attempt < retries - 1:
                print(f"Retrying in {delay_between_retries} seconds...")
                time.sleep(delay_between_retries)
            else:
                # Last attempt failed: capture a screenshot for debugging.
                browser.save_screenshot(f"error_{username}.png")
                print(f"Failed to scrape {username} after {retries} attempts. Moving on...")
    # Reached only after all retries fail (or if retries is ever 0 again):
    # always return a well-formed row so the CSV writer cannot crash.
    return [username, "Error", "Error", "Error", "Error", "Unknown"]
def get_already_scraped_usernames():
    """Return the first column of scraped_data.csv (usernames already done).

    Returns an empty list when the file does not exist yet. Blank rows are
    skipped — previously `row[0]` raised IndexError on an empty row (e.g. a
    stray blank line left by a trailing newline).
    """
    try:
        with open('scraped_data.csv', mode='r', newline='', encoding='utf-8') as file:
            return [row[0] for row in csv.reader(file) if row]
    except FileNotFoundError:
        return []
def scrape_usernames_from_csv():
    """Drive the scrape: read usernames.csv, skip already-scraped names, and
    append one row per profile to scraped_data.csv."""
    already_scraped = get_already_scraped_usernames()
    with open('usernames.csv', mode='r', newline='', encoding='utf-8') as file:
        # Skip blank rows so a trailing newline can't raise IndexError.
        all_usernames = [row[0] for row in csv.reader(file) if row]
    # Write the header once, the first time the output file is created.
    if 'Username' not in already_scraped:
        with open('scraped_data.csv', mode='a', newline='', encoding='utf-8') as file:
            csv.writer(file).writerow(['Username', 'Followers', 'Following', 'NumPosts', 'Bio', 'ProfileStatus'])
    try:
        for username in all_usernames:
            if username in already_scraped:
                print(f"{username} has already been scraped. Skipping.")
                continue
            print(f"Scraping data for {username}")
            data = get_profile_data(username)
            # Reopen in append mode per row so progress survives a crash.
            with open('scraped_data.csv', mode='a', newline='', encoding='utf-8') as file:
                csv.writer(file).writerow(data)
        print("Finished scraping all usernames!")
    finally:
        # FIX: previously the browser was only quit on success, leaking the
        # Firefox process if any scrape raised. Always release it.
        browser.quit()
# (Removed the "Editor is loading..." paste artifact that was fused onto the
# end of this line in the original — it was a SyntaxError, not code.)
scrape_usernames_from_csv()