import requests
import pandas as pd
import numpy as np
import json
import time
import os
def fetch_all_papers(url, params, json_file_path):
    """Page through the paper-search endpoint, appending results to a JSON file."""
    all_papers = []
    # Check if the file exists and load existing data
    if os.path.exists(json_file_path):
        try:
            with open(json_file_path, 'r') as f:
                all_papers = json.load(f)
            print(f"Loaded {len(all_papers)} existing papers from {json_file_path}")
        except json.JSONDecodeError:
            print(f"Error loading JSON from {json_file_path}, starting with empty list")
    initial_paper_count = len(all_papers)
    try_count = 0
    # Keep paging until there are no more results, the paper limit is hit,
    # or the request has been rate-limited three times
    while try_count < 3:
        # Make the GET request
        response = requests.get(url, params=params)
        time.sleep(15)  # 15-second delay after each API call
        if response.status_code == 200:
            data = response.json()
            new_papers = data.get("data", [])
            if new_papers:  # Only process and save if we got new papers
                # Add the new papers to our list
                all_papers.extend(new_papers)
                # Save the updated list to the JSON file (appending new data)
                with open(json_file_path, 'w') as f:
                    json.dump(all_papers, f, indent=2)
                print(f"Saved {len(all_papers)} papers to {json_file_path} (added {len(new_papers)} new papers)")
                # Check if there's a next page
                if "next" in data:
                    params["offset"] = data["next"]
                else:
                    print("No more pages available")
                    break
            else:
                print("No new papers found in response")
                break
            # Limit check
            if len(all_papers) >= 100:
                print("Reached the paper limit (100)")
                break
        elif response.status_code == 429:
            print("Rate limit exceeded (429). Waiting 30 seconds...")
            time.sleep(30)  # 30-second delay for rate limiting
            try_count += 1
        else:
            print(f"Error: {response.status_code}, {response.text}")
            break
    papers_added = len(all_papers) - initial_paper_count
    print(f"Collection completed. Added {papers_added} new papers to {json_file_path}")
    return all_papers
# Search endpoint (assumed to be the Semantic Scholar Graph API paper search,
# since `url` is used below but was never defined in the original script)
url = "https://api.semanticscholar.org/graph/v1/paper/search"

# Query parameters
fields = [
    "paperId", "corpusId", "externalIds", "url", "title", "abstract", "venue",
    "publicationVenue", "year", "referenceCount", "citationCount",
    "influentialCitationCount", "isOpenAccess", "openAccessPdf",
    "fieldsOfStudy", "s2FieldsOfStudy", "publicationTypes", "publicationDate",
    "journal", "citationStyles", "authors", "citations", "references",
    "embedding", "tldr"
]
fields_of_study = {
    "Mathematics": ["Equations", "Calculus", "Cryptography", "Probability",
                    "Statistics", "Graph Theory"],
    "Engineering": ["Engineering", "Mechanical", "Aerospace"],
    "Environmental Science": ["Climate Change", "Renewable Energy", "Biodiversity",
                              "Sustainability", "Waste Management"],
    "Business": ["Entrepreneurship", "Finance", "Marketing",
                 "Economic Policy", "Business Analytics"],
    "Education": ["Pedagogy", "Curriculum Development", "Educational Technology",
                  "Special Education", "Higher Education"]
}
# Fetch papers for each field of study, one JSON file per field
for key in fields_of_study:
    print(key)
    json_file_path = f"../data/{key}.json"
    # Skip fields that already have a data file; otherwise start with an empty list
    if os.path.exists(json_file_path):
        continue
    with open(json_file_path, 'w') as f:
        json.dump([], f)
    for keyword in fields_of_study[key]:
        params = {
            "query": keyword,
            "fields": ",".join(fields)
        }
        papers = fetch_all_papers(url, params, json_file_path)
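
# pandas is imported above but never used in the original script; the sketch
# below is one possible way to load the saved per-field JSON files into a
# single DataFrame for analysis. It assumes the ../data/<field>.json files
# written by the loop above exist, and the column names come from the
# `fields` list requested above; adjust both if your layout differs.
frames = []
for key in fields_of_study:
    json_file_path = f"../data/{key}.json"
    if not os.path.exists(json_file_path):
        continue
    with open(json_file_path, 'r') as f:
        papers = json.load(f)
    df = pd.json_normalize(papers)  # flatten nested response fields into columns
    df["field"] = key               # remember which field-of-study file each row came from
    frames.append(df)

if frames:
    all_papers_df = pd.concat(frames, ignore_index=True)
    if not all_papers_df.empty:
        print(all_papers_df[["title", "year", "citationCount", "field"]].head())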