open_society_foundation_cleaner

Cleans raw JSON retrieved from Open Society Foundation.
import json
import os
import re
import sys

from bs4 import BeautifulSoup
import pandas as pd



DATA_DIR = os.path.join(os.environ['HOME'], 'dev/open-society-foundation/data/RAW/20240923')
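
# Each raw file is expected to hold either plain HTML or a JSON object with an
# 'html' field containing the grants-database markup (see get_html below).
# Illustrative shape only; real payloads may carry additional keys:
#   {"html": "<ul class=\"m-grantsDatabase__list\"> ... </ul>"}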


def natural_sort_key(s, _nsre=re.compile('([0-9]+)')):
    return [int(text) if text.isdigit() else text.lower()
            for text in _nsre.split(s)]
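
# Usage example with hypothetical filenames:
#   sorted(['page_10.json', 'page_2.json'], key=natural_sort_key)
#   -> ['page_2.json', 'page_10.json']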


def get_html(fpath):
    # Read the raw html/json out of the file
    try:
        with open(fpath, 'r') as f:
            html = f.read()
    except Exception as e:
        print(type(e), e, file=sys.stderr)
        return None
    # attempt to parse json and extract the 'html' section
    try:
        obj = json.loads(html)
        if len(obj['html']) == 0:
            return None
        return obj['html']
    except json.decoder.JSONDecodeError:
        # not a json object; return the raw contents
        return html
    except Exception as e:
        # unknown error: report it and fall through
        print(fpath, type(e), e, file=sys.stderr)
    # None signals an error condition; callers skip these files
    return None


def extract_grants_database(html):
    soup = BeautifulSoup(html, 'html.parser')
    grants_list = soup.find('ul', class_="m-grantsDatabase__list")
    return grants_list.decode_contents().strip()


def get_inner_html(fpath):
    html = get_html(fpath)
    if html is None:
        return None
    if 'm-grantsDatabase__list' in html:
        return extract_grants_database(html)
    return html


def get_element_data(list_item):
    # Pull the grant fields out of a grants-database entry. Each field is
    # extracted independently, so a missing cell only costs that one column.
    data = {}
    soup = BeautifulSoup(list_item, 'html.parser')

    try:
        obj = soup.find('div', class_="a-grantsDatabase")
        data['id'] = obj['id']
    except Exception:
        print("a-grantsDatabase['id'] not found", file=sys.stderr)

    try:
        obj = soup.find('h2', class_="a-grantsDatabase__title")
        data['title'] = obj.get_text().strip()
    except Exception:
        print('h2.a-grantsDatabase__title not found', file=sys.stderr)

    try:
        obj = soup.find('div', class_='a-grantsDatabase__cell--1')
        data['year'] = obj.get_text().strip()
    except Exception:
        print("a-grantsDatabase__cell--1 not found", file=sys.stderr)

    try:
        obj = soup.find('div', class_='a-grantsDatabase__cell--2')
        data['amount'] = obj.get_text().strip()
        # keep the raw amount string plus a numeric copy with everything
        # except the digits stripped out
        data['amount_'] = int(re.sub(r"[^0-9]", "", data['amount']))
    except Exception:
        print("a-grantsDatabase__cell--2 not found", file=sys.stderr)

    try:
        obj = soup.find('div', class_='a-grantsDatabase__cell--6')
        obj = obj.find('p')
        data['description'] = obj.get_text().strip().capitalize()
    except Exception:
        print("a-grantsDatabase__cell--6 not found", file=sys.stderr)

    try:
        # the meta cell holds label/value pairs; the lowercased label becomes the key
        obj = soup.find('div', class_='a-grantsDatabase__cell--meta')
        for sub_item in obj.find_all('div', 'a-grantsDatabase__cell--3'):
            key = sub_item.find('span').get_text().strip().lower()
            value = sub_item.find('p').get_text().strip()
            value = ' > '.join([s.strip() for s in value.split("\n")])
            data[key] = value
    except Exception:
        print("a-grantsDatabase__cell--meta not found", file=sys.stderr)

    return data
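
# A typical record comes out roughly like this (illustrative values; keys match
# the selectors above, and each lowercased meta label becomes its own column):
#   {'id': '...', 'title': '...', 'year': '2023', 'amount': '$100,000',
#    'amount_': 100000, 'description': '...', '<meta label>': '...'}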



if __name__ == '__main__':
    files = [os.path.join(DATA_DIR, f)
             for f in sorted(os.listdir(DATA_DIR), key=natural_sort_key)]
    # read each file once and skip anything that could not be read or parsed
    inner_htmls = (get_inner_html(fpath) for fpath in files)
    dataset = [get_element_data(html) for html in inner_htmls if html is not None]
    pd.DataFrame(dataset).to_csv('open_society_foundation.csv', index=False)
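
# Running the script scans DATA_DIR in natural filename order and writes
# open_society_foundation.csv to the current working directory, e.g.:
#   $ python open_society_foundation_cleaner.py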