open_society_foundation_cleaner
Cleans raw JSON retrieved from Open Society Foundation.unknown
python
10 days ago
3.4 kB
7
Indexable
Never
import json
import os
import re
import sys

from bs4 import BeautifulSoup
import pandas as pd

# expanduser('~') resolves to $HOME when it is set, and still works when it
# is not (where os.environ['HOME'] would raise KeyError).
DATA_DIR = os.path.join(os.path.expanduser('~'),
                        'dev/open-society-foundation/data/RAW/20240923')


def natural_sort_key(s, _nsre=re.compile('([0-9]+)')):
    """Return a sort key that orders embedded ASCII digit runs numerically.

    The string is split on runs of ``[0-9]``; digit runs become ``int`` and
    the text between them is lower-cased, so ``'page10'`` sorts after
    ``'page9'``.

    >>> natural_sort_key('page10.json')
    ['page', 10, '.json']
    """
    # re.split with a single capturing group always alternates
    # text, digits, text, ... (starting with a possibly empty text piece).
    # Deciding by position instead of str.isdigit() keeps non-ASCII digit
    # characters (e.g. superscripts) in the text slots, where int() would
    # otherwise fail or produce keys that cannot be compared.
    return [int(part) if idx % 2 else part.lower()
            for idx, part in enumerate(_nsre.split(s))]


def get_html(fpath):
    """Read *fpath* and return the HTML it carries, or ``None`` to skip it.

    * File cannot be read -> error printed to stderr, returns ``None``.
    * Content is not JSON -> the raw file content is returned as-is.
    * Content is JSON     -> returns the non-empty string under ``'html'``;
      a missing, empty or non-string ``'html'`` yields ``None``.
    """
    # Read raw html/json out of file
    try:
        with open(fpath, 'r', encoding='utf-8') as f:
            raw = f.read()
    except (OSError, ValueError) as e:
        # ValueError also covers UnicodeDecodeError and invalid path strings.
        print(type(e), e, file=sys.stderr)
        return None

    # attempt to parse json and extract the 'html' section
    try:
        obj = json.loads(raw)
    except json.JSONDecodeError:
        # not a json object. return raw contents
        return raw

    try:
        inner = obj['html']
    except (KeyError, TypeError, IndexError) as e:
        # valid JSON without an 'html' entry: report it and skip the file
        print(fpath, type(e), e, file=sys.stderr)
        return None

    if not isinstance(inner, str):
        print(fpath, type(inner), "'html' is not a string", file=sys.stderr)
        return None
    if not inner:
        return None
    return inner


def extract_grants_database(html):
    """Return the inner markup of the first ``ul.m-grantsDatabase__list``.

    If *html* contains no such element it is returned unchanged, mirroring
    how get_inner_html() treats markup that has no list wrapper.
    """
    soup = BeautifulSoup(html, 'html.parser')
    listing = soup.find('ul', class_="m-grantsDatabase__list")
    if listing is None:
        return html
    return listing.decode_contents().strip()


def get_inner_html(fpath):
    """Load *fpath* via get_html() and strip the grants list wrapper.

    Returns ``None`` when get_html() reports the file as unusable.
    """
    html = get_html(fpath)
    if html is None:
        return None
    if 'm-grantsDatabase__list' in html:
        return extract_grants_database(html)
    return html


def _cell_text(soup, tag_name, css_class):
    """Return the stripped text of the first matching tag, or ``None``."""
    node = soup.find(tag_name, class_=css_class)
    if node is None:
        return None
    return node.get_text().strip()


def get_element_data(list_item):
    """Extract one grant record from the markup of a single list item.

    Returns a dict holding whichever of these could be found: ``id``,
    ``title``, ``year``, ``amount`` (text), ``amount_`` (int built from the
    digits of ``amount``), ``description``, plus one entry per
    ``a-grantsDatabase__cell--3`` block inside the ``cell--meta`` section,
    keyed by its lower-cased ``<span>`` label.  Every missing piece is
    reported on stderr and simply left out of the dict.
    """
    data = {}
    soup = BeautifulSoup(list_item, 'html.parser')

    # id: attribute of the wrapping div
    container = soup.find('div', class_="a-grantsDatabase")
    if container is not None and container.has_attr('id'):
        data['id'] = container['id']
    else:
        print("a-grantsDatabase['id'] not found", file=sys.stderr)

    title = _cell_text(soup, 'h2', 'a-grantsDatabase__title')
    if title is None:
        print('h2.a-grantsDatabase__title not found', file=sys.stderr)
    else:
        data['title'] = title

    year = _cell_text(soup, 'div', 'a-grantsDatabase__cell--1')
    if year is None:
        print("a-grantsDatabase__cell--1 not found", file=sys.stderr)
    else:
        data['year'] = year

    amount = _cell_text(soup, 'div', 'a-grantsDatabase__cell--2')
    if amount is None:
        print("a-grantsDatabase__cell--2 not found", file=sys.stderr)
    else:
        data['amount'] = amount
        # Keep only the digits; any other character (currency sign,
        # separators, ...) is dropped before conversion.
        digits = re.sub(r"[^0-9]", "", amount)
        if digits:
            data['amount_'] = int(digits)
        else:
            print("a-grantsDatabase__cell--2 has no numeric amount",
                  file=sys.stderr)

    description_cell = soup.find('div', class_='a-grantsDatabase__cell--6')
    paragraph = None if description_cell is None else description_cell.find('p')
    if paragraph is None:
        print("a-grantsDatabase__cell--6 not found", file=sys.stderr)
    else:
        data['description'] = paragraph.get_text().strip().capitalize()

    meta = soup.find('div', class_='a-grantsDatabase__cell--meta')
    if meta is None:
        print("a-grantsDatabase__cell--meta not found", file=sys.stderr)
    else:
        for sub_item in meta.find_all('div', 'a-grantsDatabase__cell--3'):
            label = sub_item.find('span')
            body = sub_item.find('p')
            if label is None or body is None:
                # Skip only the malformed entry; keep the remaining ones.
                print("a-grantsDatabase__cell--3 entry incomplete",
                      file=sys.stderr)
                continue
            key = label.get_text().strip().lower()
            value = body.get_text().strip()
            # multi-line values are flattened into 'a > b > c'
            value = ' > '.join([s.strip() for s in value.split("\n")])
            data[key] = value

    return data


def main():
    """Convert every usable file in DATA_DIR into one CSV row."""
    files = [os.path.join(DATA_DIR, f)
             for f in sorted(os.listdir(DATA_DIR), key=natural_sort_key)]

    dataset = []
    for fpath in files:
        # Read and unwrap each file exactly once; None means "skip it".
        inner = get_inner_html(fpath)
        if inner is None:
            continue
        dataset.append(get_element_data(inner))

    pd.DataFrame(dataset).to_csv('open_society_foundation.csv', index=False)


if __name__ == '__main__':
    main()