import requests
import pandas as pd
import numpy as np
import json
import time
import os
def fetch_all_papers(url, params, json_file_path):
    """Page through the paper-search endpoint, appending results to a JSON file."""
    all_papers = []
    # Check if the file exists and load existing data
    if os.path.exists(json_file_path):
        try:
            with open(json_file_path, 'r') as f:
                all_papers = json.load(f)
            print(f"Loaded {len(all_papers)} existing papers from {json_file_path}")
        except json.JSONDecodeError:
            print(f"Error loading JSON from {json_file_path}, starting with empty list")
    initial_paper_count = len(all_papers)
    try_count = 0
    # Keep paging until there are no more results, the paper limit is hit,
    # or the request has been rate-limited three times
    while try_count < 3:
        # Make the GET request
        response = requests.get(url, params=params)
        time.sleep(15)  # 15-second delay after each API call
        if response.status_code == 200:
            data = response.json()
            new_papers = data.get("data", [])
            if new_papers:  # Only process and save if we got new papers
                # Add the new papers to our list
                all_papers.extend(new_papers)
                # Save the updated list to the JSON file (appending new data)
                with open(json_file_path, 'w') as f:
                    json.dump(all_papers, f, indent=2)
                print(f"Saved {len(all_papers)} papers to {json_file_path} (added {len(new_papers)} new papers)")
                # Check if there's a next page
                if "next" in data:
                    params["offset"] = data["next"]
                else:
                    print("No more pages available")
                    break
            else:
                print("No new papers found in response")
                break
            # Limit check
            if len(all_papers) >= 100:
                print("Reached the paper limit (100)")
                break
        elif response.status_code == 429:
            print("Rate limit exceeded (429). Waiting 30 seconds...")
            time.sleep(30)  # 30-second delay for rate limiting
            try_count += 1
        else:
            print(f"Error: {response.status_code}, {response.text}")
            break
    papers_added = len(all_papers) - initial_paper_count
    print(f"Collection completed. Added {papers_added} new papers to {json_file_path}")
    return all_papers
# Search endpoint (assumed to be the Semantic Scholar Graph API paper search,
# since `url` is used below but was never defined in the original script)
url = "https://api.semanticscholar.org/graph/v1/paper/search"

# Query parameters
fields = [
    "paperId", "corpusId", "externalIds", "url", "title", "abstract", "venue",
    "publicationVenue", "year", "referenceCount", "citationCount",
    "influentialCitationCount", "isOpenAccess", "openAccessPdf",
    "fieldsOfStudy", "s2FieldsOfStudy", "publicationTypes", "publicationDate",
    "journal", "citationStyles", "authors", "citations", "references",
    "embedding", "tldr"
]
fields_of_study = {
    "Mathematics": ["Equations", "Calculus", "Cryptography", "Probability",
                    "Statistics", "Graph Theory"],
    "Engineering": ["Engineering", "Mechanical", "Aerospace"],
    "Environmental Science": ["Climate Change", "Renewable Energy", "Biodiversity",
                              "Sustainability", "Waste Management"],
    "Business": ["Entrepreneurship", "Finance", "Marketing",
                 "Economic Policy", "Business Analytics"],
    "Education": ["Pedagogy", "Curriculum Development", "Educational Technology",
                  "Special Education", "Higher Education"]
}
# Fetch papers for each field of study, one JSON file per field
for key in fields_of_study:
    print(key)
    json_file_path = f"../data/{key}.json"
    # Skip fields that already have a data file; otherwise start with an empty list
    if os.path.exists(json_file_path):
        continue
    with open(json_file_path, 'w') as f:
        json.dump([], f)
    for keyword in fields_of_study[key]:
        params = {
            "query": keyword,
            "fields": ",".join(fields)
        }
        papers = fetch_all_papers(url, params, json_file_path)
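
# pandas is imported above but never used in the original script; the sketch
# below is one possible way to load the saved per-field JSON files into a
# single DataFrame for analysis. It assumes the ../data/<field>.json files
# written by the loop above exist, and the column names come from the
# `fields` list requested above; adjust both if your layout differs.
frames = []
for key in fields_of_study:
    json_file_path = f"../data/{key}.json"
    if not os.path.exists(json_file_path):
        continue
    with open(json_file_path, 'r') as f:
        papers = json.load(f)
    df = pd.json_normalize(papers)  # flatten nested response fields into columns
    df["field"] = key               # remember which field-of-study file each row came from
    frames.append(df)

if frames:
    all_papers_df = pd.concat(frames, ignore_index=True)
    if not all_papers_df.empty:
        print(all_papers_df[["title", "year", "citationCount", "field"]].head())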