import csv
import random
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.common.exceptions import NoSuchElementException

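# Accounts the scraper rotates between. Assumed to be valid Instagram logins;
# note that the credentials are stored here in plain text.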
ACCOUNTS = [
    {'username': '_shapira_shapira', 'password': 'SHap200400505'}, 
    {'username': 'dan.spinning.culture', 'password': '200400505'},  
]

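# Start Firefox with an existing profile so any saved Instagram session
# (cookies, logins) carries over between runs.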
options = Options()
options.add_argument("-profile")
options.add_argument("/Users/ss/Library/Application Support/Firefox/Profiles/s5ojir44.shapira")
browser = webdriver.Firefox(options=options)

def switch_account():
    """Switch to a different Instagram account."""
    # Log out from the current account, if any. These absolute XPaths are tied
    # to one specific Instagram page layout and will break whenever the DOM changes.
    browser.get("https://www.instagram.com/")
    time.sleep(5)
    # Presumably opens the settings/profile menu.
    browser.find_element(By.XPATH, "/html/body/div[2]/div/div/div[2]/div/div/div/div[1]/div[1]/div[1]/div/div/div/div/div[3]/span/div").click()
    time.sleep(2)
    # Presumably clicks the "Log out" entry in that menu.
    browser.find_element(By.XPATH, "/html/body/div[2]/div/div/div[2]/div/div/div/div[2]/div/div/div[1]/div[1]/div/div/div/div/div/div/div[1]/div/div[6]/div[1]").click()
    time.sleep(2)

    # Randomly pick an account from the list and login
    account = random.choice(ACCOUNTS)
    username = account["username"]
    password = account["password"]
    
    browser.get("https://www.instagram.com/accounts/login/")
    time.sleep(2)

    username_input = browser.find_element(By.NAME, "username")
    password_input = browser.find_element(By.NAME, "password")

    username_input.send_keys(username)
    password_input.send_keys(password)

    login_button = browser.find_element(By.XPATH, "//button[@type='submit']")
    login_button.click()

    time.sleep(5)  # Generous fixed wait to let the login flow complete
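
    # A more robust alternative to fixed sleeps (a sketch, not used by this
    # script) would be selenium's explicit waits, e.g.:
    #
    #   from selenium.webdriver.support.ui import WebDriverWait
    #   from selenium.webdriver.support import expected_conditions as EC
    #   WebDriverWait(browser, 15).until(
    #       EC.presence_of_element_located((By.XPATH, "//div[@role='main']"))
    #   )
    #
    # The XPath above is an assumption; pick any element that only appears
    # once the logged-in home page has rendered.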


def random_scrolling():
    """Randomly scroll the profile page."""
    browser.execute_script("window.scrollTo(0, document.body.scrollHeight/3);")
    time.sleep(random.uniform(0.5, 0.9))
    browser.execute_script("window.scrollTo(0, 0);")
    time.sleep(random.uniform(0.5, 0.9))

def visit_random_page():
    """Occasionally visit Explore and Homepage."""
    pages = ["https://www.instagram.com/", "https://www.instagram.com/explore/"]
    chosen_page = random.choice(pages)
    try:
        print(f"Visiting: {chosen_page}")
        browser.get(chosen_page)
        time.sleep(random.uniform(1, 3))
    except Exception as e:
        print(f"Error visiting {chosen_page}. Error: {e}")

def parse_int_from_string(s):
    """Parses numbers represented as strings into integers. Handles 'K' and 'M'."""
    if 'K' in s:
        return int(float(s.replace('K', '')) * 1000)
    elif 'M' in s:
        return int(float(s.replace('M', '')) * 1000000)
    else:
        return int(s.replace(',', ''))
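    # Example behavior: parse_int_from_string("1.2K") -> 1200,
    # parse_int_from_string("3M") -> 3000000, parse_int_from_string("4,521") -> 4521.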

def check_profile_type():
    """Return 'public' or 'private' based on which layout-specific element is present, or None."""
    try:
        browser.find_element(By.XPATH, "/html/body/div[2]/div/div/div[2]/div/div/div/div[1]/div[1]/div[2]/div[2]/section/main/div/div[2]")
        return "public"
    except NoSuchElementException:
        pass

    try:
        browser.find_element(By.XPATH, "/html/body/div[2]/div/div/div[2]/div/div/div/div[1]/div[1]/div[2]/div[2]/section/main/div/div/article/div/div/h2")
        return "private"
    except NoSuchElementException:
        pass

    return None
    
def get_profile_data(username):
    retries = 3  # Number of attempts per profile
    delay_between_retries = 3  # Delay in seconds between attempts

    if random.random() < 0.99:  # 99% likelihood to switch account on each profile visit
        switch_account()

    if random.random() < 0.25:  # 25% likelihood to wander to another page first
        visit_random_page()

    for attempt in range(retries):
        try:
            browser.get(f'https://www.instagram.com/{username}/')
            time.sleep(random.uniform(2, 5))

            profile_type = check_profile_type()
            if not profile_type:
                raise Exception("Couldn't determine profile type (Public/Private/None).")

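            # Candidate XPaths for each field; public and private profiles use
            # different page layouts, hence two entries per field. All of these
            # are tied to a specific Instagram DOM and will need updating over time.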
            data_elements = [
                {"name": "followers", "xpath": "/html/body/div[2]/div/div/div[2]/div/div/div/div[1]/div[1]/div[2]/div[2]/section/main/div/header/section/ul/li[2]/a/span", "type": "public"},
                {"name": "following", "xpath": "/html/body/div[2]/div/div/div[2]/div/div/div/div[1]/div[1]/div[2]/div[2]/section/main/div/header/section/ul/li[3]/a/span", "type": "public"},
                {"name": "num_posts", "xpath": "/html/body/div[2]/div/div/div[2]/div/div/div/div[1]/div[1]/div[2]/div[2]/section/main/div/header/section/ul/li[1]/span", "type": "public"},
                {"name": "bio", "xpath": "/html/body/div[2]/div/div/div[2]/div/div/div/div[1]/div[1]/div[2]/div[2]/section/main/div/header/section/div[3]/h1", "type": "public"},
                {"name": "followers", "xpath": "/html/body/div[2]/div/div/div[2]/div/div/div/div[1]/div[1]/div[2]/div[2]/section/main/div/header/section/ul/li[2]/span/span", "type": "private"},
                {"name": "following", "xpath": "/html/body/div[2]/div/div/div[2]/div/div/div/div[1]/div[1]/div[2]/div[2]/section/main/div/header/section/ul/li[3]/span/span", "type": "private"},
                {"name": "num_posts", "xpath": "/html/body/div[2]/div/div/div[2]/div/div/div/div[1]/div[1]/div[2]/div[2]/section/main/div/header/section/ul/li[1]/span/span", "type": "private"},
                {"name": "bio", "xpath": "/html/body/div[2]/div/div/div[2]/div/div/div/div[1]/div[1]/div[2]/div[2]/section/main/div/header/section/div[3]/h1", "type": "private"},
            ]

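            # Shuffle so the fields are read in a varying order on each visit.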
            random.shuffle(data_elements)

            data = {"username": username, "followers": "Error", "following": "Error", "num_posts": "Error", "bio": "Error", "profile_type": profile_type.capitalize()}

            for element in data_elements:
                if element["type"] != profile_type:
                    continue
                try:
                    text = browser.find_element(By.XPATH, element["xpath"]).text
                    if element["name"] in ("followers", "following", "num_posts"):
                        data[element["name"]] = parse_int_from_string(text)
                    else:
                        data[element["name"]] = text
                except NoSuchElementException:
                    if element["name"] == "bio":
                        data["bio"] = ""  # A missing bio becomes an empty string, not an error

            return [data["username"], data["followers"], data["following"], data["num_posts"], data["bio"], data["profile_type"]]

        except Exception as e:
            print(f"Error scraping {username} on attempt {attempt + 1}: {e}")
            if attempt < retries - 1:
                print(f"Retrying in {delay_between_retries} seconds...")
                time.sleep(delay_between_retries)
            else:
                # All retries exhausted: save a screenshot to help debug the failure.
                browser.save_screenshot(f"error_{username}.png")
                print(f"Failed to scrape {username} after {retries} attempts. Moving on...")

    return [username, "Error", "Error", "Error", "Error", "Unknown"]

def get_already_scraped_usernames():
    try:
        with open('scraped_data.csv', mode='r', newline='', encoding='utf-8') as file:
            reader = csv.reader(file)
            already_scraped = [row[0] for row in reader if row]  # skip blank rows
        return already_scraped
    except FileNotFoundError:
        return []

def scrape_usernames_from_csv():
    already_scraped = get_already_scraped_usernames()

    with open('usernames.csv', mode='r', newline='', encoding='utf-8') as file:
        reader = csv.reader(file)
        all_usernames = [row[0] for row in reader if row]  # skip blank rows

    # Write the header row if it isn't there yet (this also creates scraped_data.csv on the first run).
    if 'Username' not in already_scraped:
        with open('scraped_data.csv', mode='a', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow(['Username', 'Followers', 'Following', 'NumPosts', 'Bio', 'ProfileStatus'])
        
    for username in all_usernames:
        if username in already_scraped:
            print(f"{username} has already been scraped. Skipping.")
            continue

        print(f"Scraping data for {username}")
        data = get_profile_data(username)
        
        with open('scraped_data.csv', mode='a', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow(data)

    print("Finished scraping all usernames!")
    browser.quit()

if __name__ == "__main__":
    scrape_usernames_from_csv()