2 years ago
6.5 kB
"""Collect free proxies from several public sources and save them to a file."""

import argparse
import asyncio
import json
import platform
import re
import sys
import time

import httpx
from bs4 import BeautifulSoup


class Scraper:
    """Base scraper: download one URL and pull ``ip[:port]`` strings out of it.

    Subclasses customise the request URL by overriding ``get_url`` and/or
    turn the raw response into plain text by overriding ``handle``.
    """

    # IPv4 address with an optional ``:port`` suffix; compiled once for all
    # scrapers instead of on every ``scrape`` call.
    PROXY_PATTERN = re.compile(r"\d{1,3}(?:\.\d{1,3}){3}(?::\d{1,5})?")

    def __init__(self, method, _url):
        """Store the proxy type (``method``) and the URL template (``_url``)."""
        self.method = method
        self._url = _url

    def get_url(self, **kwargs):
        """Return the URL template formatted with ``kwargs`` and ``method``."""
        return self._url.format(**kwargs, method=self.method)

    async def get_response(self, client):
        """GET this scraper's URL with the given async HTTP client."""
        return await client.get(self.get_url())

    async def handle(self, response):
        """Return the text that ``scrape`` searches for proxy addresses."""
        return response.text

    async def scrape(self, client):
        """Fetch the source and return every ``ip[:port]`` match as a list."""
        response = await self.get_response(client)
        text = await self.handle(response)
        return self.PROXY_PATTERN.findall(text)


# From spys.me
class SpysMeScraper(Scraper):
    """Scraper for the plain-text lists published by spys.me."""

    # Proxy type -> file name component used in the spys.me URL.
    _MODES = {"http": "proxy", "socks": "socks"}

    def __init__(self, method):
        super().__init__(method, "https://spys.me/{mode}.txt")

    def get_url(self, **kwargs):
        """Return the list URL.

        Raises:
            NotImplementedError: if ``method`` is neither "http" nor "socks".
        """
        mode = self._MODES.get(self.method)
        if mode is None:
            raise NotImplementedError
        return super().get_url(mode=mode, **kwargs)


# From proxyscrape.com
class ProxyScrapeScraper(Scraper):
    """Scraper for the proxyscrape.com API."""

    def __init__(self, method, timeout=1000, country="All"):
        # NOTE: the attribute is spelled ``timout`` (sic); it is kept because
        # it matches the ``{timout}`` placeholder and may be read by callers.
        self.timout = timeout
        self.country = country
        super().__init__(method,
                         "https://api.proxyscrape.com/?request=getproxies"
                         "&proxytype={method}"
                         "&timeout={timout}"
                         "&country={country}")

    def get_url(self, **kwargs):
        """Return the API URL with the timeout and country filled in."""
        return super().get_url(timout=self.timout, country=self.country, **kwargs)


# From geonode.com - A little dirty, grab http(s) and socks but use just for socks
class GeoNodeScraper(Scraper):
    """Scraper for the geonode.com proxy-list API."""

    def __init__(self, method, limit="500", page="1", sort_by="lastChecked", sort_type="desc"):
        self.limit = limit
        self.page = page
        self.sort_by = sort_by
        self.sort_type = sort_type
        super().__init__(method,
                         "https://proxylist.geonode.com/api/proxy-list?"
                         "&limit={limit}"
                         "&page={page}"
                         "&sort_by={sort_by}"
                         "&sort_type={sort_type}")

    def get_url(self, **kwargs):
        """Return the API URL with paging and sorting parameters filled in."""
        return super().get_url(limit=self.limit,
                               page=self.page,
                               sort_by=self.sort_by,
                               sort_type=self.sort_type,
                               **kwargs)

    async def handle(self, response):
        """Join each entry's address and port so the port is not lost.

        The response is JSON, so matching the raw text only ever finds bare
        IP addresses. NOTE(review): assumes the payload looks like
        ``{"data": [{"ip": ..., "port": ...}, ...]}`` - confirm against the
        live API. Anything else falls back to the raw text, which is what
        the base class would have searched.
        """
        try:
            entries = json.loads(response.text)["data"]
            return "\n".join(f"{entry['ip']}:{entry['port']}" for entry in entries)
        except (ValueError, KeyError, TypeError):
            return response.text


# From proxy-list.download
class ProxyListDownloadScraper(Scraper):
    """Scraper for the proxy-list.download API."""

    def __init__(self, method, anon):
        self.anon = anon
        super().__init__(method, "https://www.proxy-list.download/api/v1/get?type={method}&anon={anon}")

    def get_url(self, **kwargs):
        """Return the API URL with the anonymity level filled in."""
        return super().get_url(anon=self.anon, **kwargs)


# For websites using table in html
class GeneralTableScraper(Scraper):
    """Scraper for sites that list proxies in an HTML table."""

    async def handle(self, response):
        """Return ``host:port`` lines built from the first two cells of each row.

        Returns an empty string when the expected table is not on the page.
        """
        soup = BeautifulSoup(response.text, "html.parser")
        table = soup.find("table", attrs={"class": "table table-striped table-bordered"})
        if table is None:
            # Layout changed or an error page was served: nothing to extract.
            return ""
        proxies = set()
        for row in table.find_all("tr"):
            cells = row.find_all("td")
            if len(cells) < 2:
                # Header rows (th only) and malformed rows carry no proxy.
                continue
            # Drop every kind of whitespace, non-breaking spaces included.
            host, port = ("".join(cell.text.split()) for cell in cells[:2])
            proxies.add(f"{host}:{port}")
        return "\n".join(proxies)


scrapers = [
    SpysMeScraper("http"),
    SpysMeScraper("socks"),
    ProxyScrapeScraper("http"),
    ProxyScrapeScraper("socks4"),
    ProxyScrapeScraper("socks5"),
    GeoNodeScraper("socks"),
    ProxyListDownloadScraper("https", "elite"),
    ProxyListDownloadScraper("http", "elite"),
    ProxyListDownloadScraper("http", "transparent"),
    ProxyListDownloadScraper("http", "anonymous"),
    GeneralTableScraper("https", "http://sslproxies.org"),
    GeneralTableScraper("http", "http://free-proxy-list.net"),
    GeneralTableScraper("http", "http://us-proxy.org"),
    GeneralTableScraper("socks", "http://socks-proxy.net"),
]


def verbose_print(verbose, message):
    """Print ``message`` only when ``verbose`` is truthy."""
    if verbose:
        print(message)


async def scrape(method, output, verbose):
    """Run every scraper matching ``method`` and write the proxies to ``output``.

    Args:
        method: proxy type; "socks" also selects "socks4" and "socks5" sources.
        output: path of the text file to write, one proxy per line.
        verbose: when truthy, print progress and per-source failures.

    Raises:
        ValueError: if no registered scraper supports ``method``.
    """
    started = time.time()
    methods = [method]
    if method == "socks":
        methods += ["socks4", "socks5"]
    proxy_scrapers = [s for s in scrapers if s.method in methods]
    if not proxy_scrapers:
        raise ValueError("Method not supported")
    verbose_print(verbose, "Scraping proxies...")
    proxies = []

    async def scrape_scraper(scraper, client):
        # Best effort: one broken source must not abort the others, but the
        # failure is reported in verbose mode instead of vanishing silently.
        try:
            verbose_print(verbose, f"Looking {scraper.get_url()}...")
            proxies.extend(await scraper.scrape(client))
        except Exception as exc:
            verbose_print(
                verbose,
                f"Failed {type(scraper).__name__}({scraper.method}): {exc!r}",
            )

    # The context manager closes the client even if gathering raises.
    async with httpx.AsyncClient(follow_redirects=True) as client:
        await asyncio.gather(*(scrape_scraper(s, client) for s in proxy_scrapers))

    verbose_print(verbose, f"Writing {len(proxies)} proxies to file...")
    with open(output, "w", encoding="utf-8") as f:
        f.write("\n".join(proxies))
    verbose_print(verbose, "Done!")
    verbose_print(verbose, f"Took {time.time() - started} seconds")


def main():
    """Parse the command line and run the scrape to completion."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-p",
        "--proxy",
        help="Supported proxy type: " + ", ".join(sorted({s.method for s in scrapers})),
        required=True,
    )
    parser.add_argument(
        "-o",
        "--output",
        help="Output file name to save .txt file",
        default="output.txt",
    )
    parser.add_argument(
        "-v",
        "--verbose",
        help="Increase output verbosity",
        action="store_true",
    )
    args = parser.parse_args()

    if sys.version_info >= (3, 7) and platform.system() != "Windows":
        asyncio.run(scrape(args.proxy, args.output, args.verbose))
    else:
        # Windows and pre-3.7 interpreters keep the explicit-loop path of the
        # original script. The loop is created explicitly because calling
        # get_event_loop() without a running loop is deprecated.
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        try:
            loop.run_until_complete(scrape(args.proxy, args.output, args.verbose))
        finally:
            loop.close()


if __name__ == "__main__":
    main()
Editor is loading...