open_society_foundation_cleaner
Cleans raw JSON retrieved from the Open Society Foundation.
unknown
python
a year ago
3.4 kB
26
Indexable
import json
import os
import re
import sys
from bs4 import BeautifulSoup
import pandas as pd
# Directory the script lists and reads its raw input files from.
# The home directory is resolved with expanduser('~') rather than
# os.environ['HOME'] so that importing this module does not raise KeyError
# on systems where HOME is not set.
DATA_DIR = os.path.join(os.path.expanduser('~'), 'dev/open-society-foundation/data/RAW/20240923')
def natural_sort_key(s, _nsre=re.compile('([0-9]+)')):
    """Return a sort key that orders embedded numbers by value.

    The string is split on runs of ASCII digits. Digit chunks become ints and
    every other chunk is lower-cased, so 'p2' sorts before 'p10' and case is
    ignored.

    Args:
        s: The string to build a key for.
        _nsre: Pre-compiled splitter pattern; not meant to be passed by
            callers.

    Returns:
        A list alternating lower-cased text chunks and ints.
    """
    key = []
    for chunk in _nsre.split(s):
        if chunk.isdigit():
            key.append(int(chunk))
        else:
            key.append(chunk.lower())
    return key
def get_html(fpath):
    """Read a raw file and return the markup it carries.

    The file is read as UTF-8 text. If the contents parse as JSON, the value
    stored under the 'html' key is returned. If the contents are not JSON at
    all, the raw text is returned unchanged.

    Args:
        fpath: Path of the file to read.

    Returns:
        The extracted markup, or None when the file cannot be read, when the
        'html' value is empty, or when the parsed JSON does not have the
        expected shape (e.g. no 'html' key). Failures other than "not JSON"
        are reported on stderr.
    """
    # Read raw html/json out of file. The encoding is pinned so the result
    # does not depend on the platform's locale default.
    try:
        with open(fpath, 'r', encoding='utf-8') as f:
            html = f.read()
    except Exception as e:
        print(type(e), e, file=sys.stderr)
        return None

    # Attempt to parse json and extract the 'html' section.
    try:
        obj = json.loads(html)
        if len(obj['html']) == 0:
            return None
        return obj['html']
    except json.JSONDecodeError:
        # Not a json object. Return raw contents.
        return html
    except Exception as e:
        # Valid JSON of an unexpected shape (missing key, wrong type, ...).
        # Report it and return None as an error condition the caller ignores.
        print(fpath, (type(e)), e, file=sys.stderr)
        return None
def extract_grants_database(html):
    """Return the inner markup of the first grants-database list.

    Parses ``html`` and takes the first ``<ul>`` whose class is
    ``m-grantsDatabase__list``.

    Args:
        html: Markup to search.

    Returns:
        The contents of that ``<ul>`` (without the ``<ul>`` tag itself),
        stripped of surrounding whitespace.

    Raises:
        IndexError: If no matching ``<ul>`` is present.
    """
    document = BeautifulSoup(html, 'html.parser')
    matches = document.find_all('ul', class_="m-grantsDatabase__list")
    first_list = matches[0]
    inner_markup = first_list.decode_contents()
    return inner_markup.strip()
def get_inner_html(fpath):
    """Load a raw file and return the markup of its grant entries.

    Args:
        fpath: Path of the file to read.

    Returns:
        The contents of the grants-database list when the loaded markup
        mentions 'm-grantsDatabase__list', otherwise the loaded markup as-is.
        None when get_html() reports the file as unusable.
    """
    html = get_html(fpath)
    # get_html() signals an unreadable or empty file with None; pass that on
    # instead of failing on the substring test below.
    if html is None:
        return None
    if 'm-grantsDatabase__list' in html:
        return extract_grants_database(html)
    return html
def get_element_data(list_item):
    """Extract one grant record from grants-database markup.

    Every field is looked up independently. When a lookup fails, a message is
    printed to stderr and that field is left out, so the returned dict may be
    partial.

    Args:
        list_item: Markup to parse.

    Returns:
        A dict that may contain 'id', 'title', 'year', 'amount' (raw text),
        'amount_' (the digits of 'amount' as an int), 'description', and one
        entry per metadata cell keyed by the cell's lower-cased label.
    """
    # Exceptions produced by an absent element or attribute: find() returns
    # None (AttributeError / TypeError on use) or the tag lacks the attribute
    # (KeyError). Anything else, including KeyboardInterrupt, propagates.
    missing = (AttributeError, KeyError, TypeError)

    data = {}
    soup = BeautifulSoup(list_item, 'html.parser')

    try:
        obj = soup.find('div', class_="a-grantsDatabase")
        data['id'] = obj['id']
    except missing:
        print("a-grantsDatabase['id'] not found", file=sys.stderr)

    try:
        obj = soup.find('h2', class_="a-grantsDatabase__title")
        data['title'] = obj.get_text().strip()
    except missing:
        print('h2.a-grantsDatabase__title not found', file=sys.stderr)

    try:
        obj = soup.find('div', class_='a-grantsDatabase__cell--1')
        data['year'] = obj.get_text().strip()
    except missing:
        print("a-grantsDatabase__cell--1 not found", file=sys.stderr)

    try:
        obj = soup.find('div', class_='a-grantsDatabase__cell--2')
        data['amount'] = obj.get_text().strip()
        # Keep only the digits so the amount can be used numerically.
        data['amount_'] = int(re.sub(r"[^0-9]", "", data['amount']))
    except missing:
        print("a-grantsDatabase__cell--2 not found", file=sys.stderr)
    except ValueError:
        # The cell exists but contains no digits; 'amount' keeps the raw text.
        print("a-grantsDatabase__cell--2 amount is not numeric", file=sys.stderr)

    try:
        obj = soup.find('div', class_='a-grantsDatabase__cell--6')
        obj = obj.find('p')
        data['description'] = obj.get_text().strip().capitalize()
    except missing:
        print("a-grantsDatabase__cell--6 not found", file=sys.stderr)

    try:
        obj = soup.find('div', class_='a-grantsDatabase__cell--meta')
        for sub_item in obj.find_all('div', 'a-grantsDatabase__cell--3'):
            # The <span> holds the label, the <p> holds the value.
            key = sub_item.find('span').get_text().strip().lower()
            value = sub_item.find('p').get_text().strip()
            # Multi-line values are flattened into a single ' > ' separated line.
            value = ' > '.join([s.strip() for s in value.split("\n")])
            data[key] = value
    except missing:
        print("a-grantsDatabase__cell--meta not found", file=sys.stderr)

    return data
if __name__ == '__main__':
    # Visit the raw files in natural (number-aware) filename order.
    names = sorted(os.listdir(DATA_DIR), key=natural_sort_key)
    files = [os.path.join(DATA_DIR, name) for name in names]

    dataset = []
    for fpath in files:
        # Skip files that get_html() reports as unusable.
        if get_html(fpath) is None:
            continue
        dataset.append(get_element_data(get_inner_html(fpath)))

    # One CSV row per processed file, without the DataFrame index column.
    pd.DataFrame(dataset).to_csv('open_society_foundation.csv', index=False)
Editor is loading...