Untitled
unknown
plain_text
8 days ago
4.0 kB
7
Indexable
import requests
import pandas as pd
import numpy as np
import json
import time
import os

# Semantic Scholar Graph API paper-search endpoint.
# NOTE(review): the original script used `url` without ever defining it
# (NameError at the fetch_all_papers(...) call); the requested fields match
# this endpoint's paper-search schema.
url = "https://api.semanticscholar.org/graph/v1/paper/search"


def fetch_all_papers(url, params, json_file_path):
    """Fetch paginated paper records from the search API and persist them.

    Resumes from any papers already saved in ``json_file_path``, appends each
    new page of results, and rewrites the file after every page so progress
    survives interruption.

    Args:
        url: Search endpoint to GET.
        params: Query-parameter dict. Its ``"offset"`` key is advanced
            in place using the API's ``"next"`` cursor as pages are consumed.
        json_file_path: Path of the JSON file used as the cumulative store.

    Returns:
        list: All papers — previously saved plus newly fetched.
    """
    all_papers = []

    # Resume from previously saved results, if any.
    if os.path.exists(json_file_path):
        try:
            with open(json_file_path, 'r') as f:
                all_papers = json.load(f)
            print(f"Loaded {len(all_papers)} existing papers from {json_file_path}")
        except json.JSONDecodeError:
            print(f"Error loading JSON from {json_file_path}, starting with empty list")

    initial_paper_count = len(all_papers)
    try_count = 0  # number of 429 (rate-limit) retries; give up after 3

    # ("while True and try_count < 3" in the original — the "True and" was redundant.)
    while try_count < 3:
        response = requests.get(url, params=params)
        time.sleep(15)  # 15-second delay after each API call

        if response.status_code == 200:
            data = response.json()
            new_papers = data.get("data", [])
            if new_papers:  # only process and save if we got new papers
                all_papers.extend(new_papers)
                # Rewrite the whole file each page so partial progress is kept.
                with open(json_file_path, 'w') as f:
                    json.dump(all_papers, f, indent=2)
                print(f"Saved {len(all_papers)} papers to {json_file_path} (added {len(new_papers)} new papers)")
                # Follow the pagination cursor, if the API provided one.
                if "next" in data:
                    params["offset"] = data["next"]
                else:
                    print("No more pages available")
                    break
            else:
                print("No new papers found in response")
                break
            # Stop once we have gathered enough papers for this file.
            if len(all_papers) >= 100:
                print("Reached the paper limit (100)")
                break
        elif response.status_code == 429:
            print("Rate limit exceeded (429). Waiting 30 seconds...")
            time.sleep(30)  # 30-second delay for rate limiting
            try_count += 1
        else:
            print(f"Error: {response.status_code}, {response.text}")
            break

    papers_added = len(all_papers) - initial_paper_count
    # Original had a literal line break inside this f-string (a syntax
    # error as pasted); the intended newline is written as \n here.
    print(f"Collection completed. \nAdded {papers_added} new papers to {json_file_path}")
    return all_papers


# Fields requested from the API for every paper record.
fields = [
    "paperId", "corpusId", "externalIds", "url", "title", "abstract",
    "venue", "publicationVenue", "year", "referenceCount", "citationCount",
    "influentialCitationCount", "isOpenAccess", "openAccessPdf",
    "fieldsOfStudy", "s2FieldsOfStudy", "publicationTypes",
    "publicationDate", "journal", "citationStyles", "authors",
    "citations", "references", "embedding", "tldr",
]

# Search keywords grouped by discipline; each discipline gets its own
# ../data/<discipline>.json output file.
# NOTE(review): "Mechanical " carries a trailing space in the original —
# preserved here since it is sent verbatim as the API query; confirm intent.
fields_of_study = {
    "Mathematics": ["equations", "calculus", "Cryptography", "Probability",
                    "Statistics", "Graph Theory"],
    "Engineering": ["Engineering", "Mechanical ", "Aerospace"],
    "Environmental Science": ["Climate Change", "Renewable Energy",
                              "Biodiversity", "Sustainability",
                              "Waste Management"],
    "Business": ["Entrepreneurship", "Finance", "Marketing",
                 "Economic Policy", "Business Analytics"],
    "Education": ["Pedagogy", "Curriculum Development",
                  "Educational Technology", "Special Education",
                  "Higher Education"],
}

for key in fields_of_study.keys():
    print(key)
    json_file_path = f"../data/{key}.json"
    # Skip disciplines that already have an output file (resume semantics:
    # an existing file means that discipline was already started/collected).
    if os.path.exists(json_file_path):
        continue
    else:
        # Seed the file with an empty list so fetch_all_papers can load it.
        with open(json_file_path, 'w') as f:
            json.dump([], f)
    for keyword in fields_of_study[key]:
        params = {
            "query": keyword,
            "fields": ",".join(fields),
        }
        papers = fetch_all_papers(url, params, json_file_path)
Editor is loading...
Leave a Comment