# process.py

"""

This script processes political contribution data and the provided board membership records.

MIT License

Copyright (c) 2024 Polish People

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

"""

import pandas as pd
from pathlib import Path
from pprint import pprint

# Disclaimer printed by main() before any processing; the user has to type
# "yes" to accept it, otherwise the script stops without doing anything.
LIMITATIONS = """
IMPORTANT NOTE ON ANALYSIS LIMITATIONS:
This script identifies similarities between contributor surnames and board member names.
However, this approach has several limitations:

1. Surname similarity does not necessarily indicate a real connection or conflict of interest.
2. Common surnames may lead to false positives.
3. The analysis does not account for family relationships beyond exact or partial (-ski, -ska) surname similarity.
4. It does not consider potential connections through other familial relationships (e.g., in-laws).
5. The analysis returns a list of "happy coincidences" of surname similarity, so a single contribution can produce multiple records.
6. Surname similarity is likely coincidental in most cases, so treat the results with utmost caution and skepticism.
7. Data sources usually do contain errors, can be incomplete etc., so make sure to validate every record manually.

Due to these limitations, each identified case must be investigated manually and individually.
The results of this script should be treated as a starting point for further investigation,
not as conclusive evidence of any wrongdoing or conflict of interest.

Users of this script are responsible for verifying any findings and ensuring
that any conclusions drawn comply with applicable laws and ethical standards.
"""


# Directory with all input files, resolved relative to the current working
# directory. The script aborts with exit status 1 as soon as a required
# input is missing.
#
# The checks raise SystemExit directly instead of calling exit(): that helper
# is injected by the `site` module for interactive sessions and is not
# available when the interpreter runs without it (e.g. `python -S`).
DATA_DIR = Path("./data")
if not DATA_DIR.exists():
    print('Data directory "data" not found')
    raise SystemExit(1)

# Sample data in boards.csv:
# full_name,role_name,nip
# Jan Kowalski,zarząd,1250028132
# Kasia Kowalska,zarząd,1250028567
BOARDS_PATH = DATA_DIR / "boards.csv"
if not BOARDS_PATH.exists():
    print('Data file "boards.csv" not found')
    raise SystemExit(1)

# Sample data in developers.csv:
# developer_name,nip
# Grupa Domki Tanie,5251495533
# Grupa Bloczki Fajne,3371821726
DEVELOPERS_PATH = DATA_DIR / "developers.csv"
if not DEVELOPERS_PATH.exists():
    print('Data file "developers.csv" not found')
    raise SystemExit(1)

# Records of surname similarity between contributors and board members will be saved here
OUTPUT_PATH = DATA_DIR / "coincidences.csv"

# Contribution listings to load, keyed by the party label that
# parse_contributions() writes into the "party" column of each row.
CONTRIBUTIONS = dict(
    pis=DATA_DIR / "wplaty-na-pis-2023-2024-pastecode_io.txt",
    po=DATA_DIR / "wplaty-na-po-2023-2024-pastecode_io.txt",
    psl=DATA_DIR / "wplaty-na-psl-2023-2024-pastecode_io.txt",
)

# (source header, internal name) pairs: column headers found in the
# contribution files and the English names the rest of the script uses.
COLUMNS_REMAP = dict(
    [
        ("data", "date"),
        ("kwota", "amount"),
        ("miejscowość", "city"),
        ("imię ojca", "father_name"),
        ("imię", "name"),
        ("nazwisko", "surname"),
    ]
)


def normalize_name(name):
    """Return *name* lower-cased with whitespace runs collapsed to one space.

    Values that pandas treats as missing (``None``, ``NaN``, ``pd.NA``)
    become an empty string. Anything else is converted with ``str`` first,
    so non-string values are accepted too.
    """
    if pd.isna(name):
        return ""
    words = str(name).lower().split()
    return " ".join(words)


def parse_contributions():
    """Load all per-party contribution files into one normalized DataFrame.

    Every file listed in ``CONTRIBUTIONS`` is read with ``pd.read_csv``,
    tagged with a ``party`` column, and has its columns renamed according to
    ``COLUMNS_REMAP``. The ``name``, ``surname``, ``father_name`` and ``city``
    columns are passed through ``normalize_name``.

    The combined frame is sorted by amount (descending), then party and
    surname (ascending), and gains two derived columns:

    * ``full_name`` - ``name`` and ``surname`` joined with a space,
    * ``percentile`` - percentile rank of ``amount`` rounded to 2 decimals.

    As a side effect the result is written to ``contributions.csv`` inside
    ``DATA_DIR``.

    Returns:
        pandas.DataFrame: the combined, sorted contributions.

    Raises:
        ValueError: if a file is missing any of the expected columns after
            renaming.
    """
    # Same for every file, so compute it once.
    expected_columns = set(COLUMNS_REMAP.values()) | {"party"}

    frames = []
    for party, path in CONTRIBUTIONS.items():
        df = pd.read_csv(path)
        df["party"] = party
        df = df.rename(columns=COLUMNS_REMAP)

        # Explicit check instead of `assert`: asserts are stripped when Python
        # runs with -O, which would let malformed files through silently.
        missing_columns = expected_columns - set(df.columns)
        if missing_columns:
            raise ValueError(
                f"Columns mismatch for {party}: missing "
                f"{sorted(missing_columns)}, found {list(df.columns)}"
            )

        for col in ["name", "surname", "father_name", "city"]:
            df[col] = df[col].apply(normalize_name)
        frames.append(df)
        print(f"Found {len(df)} rows for {party}")

    df = pd.concat(frames, ignore_index=True)
    df = df.sort_values(
        by=["amount", "party", "surname"], ascending=[False, True, True]
    )
    df["full_name"] = df["name"] + " " + df["surname"]
    df["percentile"] = df["amount"].rank(pct=True).apply(lambda x: round(x, 2))
    df.to_csv(DATA_DIR / "contributions.csv", index=False)
    return df


def combine_developers_with_boards():
    """Attach developer names to board records and return the merged frame.

    Reads ``DEVELOPERS_PATH`` and ``BOARDS_PATH``, left-joins the board rows
    with the developers on ``nip`` (every board row is kept), and passes the
    ``full_name``, ``role_name`` and ``developer_name`` columns through
    ``normalize_name``. The merged frame is also saved as ``boards_ext.csv``
    in the same directory as the boards file.
    """
    developers = pd.read_csv(DEVELOPERS_PATH)
    boards = pd.read_csv(BOARDS_PATH)

    merged = boards.merge(developers, on="nip", how="left")
    text_columns = ("full_name", "role_name", "developer_name")
    for column in text_columns:
        merged[column] = merged[column].apply(normalize_name)

    extended_path = BOARDS_PATH.parent / "boards_ext.csv"
    merged.to_csv(extended_path, index=False)
    return merged


def is_happy_coincidence(contributor_surname, board_member_name):
    """Tell whether a surname resembles any word of a board member's name.

    A word matches when it equals the surname exactly, or when both are
    longer than one character and are identical after dropping the last
    character of each (so e.g. "kowalski" matches "kowalska").

    Returns True on the first matching word, False if no word matches.
    """

    def _same_stem(token):
        # Single-character strings would leave an empty stem, so skip them.
        if len(contributor_surname) <= 1 or len(token) <= 1:
            return False
        return token[:-1] == contributor_surname[:-1]

    return any(
        token == contributor_surname or _same_stem(token)
        for token in board_member_name.split()
    )


def find_happy_coincidences(df_boards, df_contributions):
    """Compare every contribution with every board record by surname.

    Each (contribution, board member) pair accepted by
    ``is_happy_coincidence`` produces one result record, so a single
    contribution can appear in several records. Every record is
    pretty-printed as soon as it is found.

    Args:
        df_boards: frame with ``full_name`` and ``nip`` columns; an optional
            ``developer_name`` column is used when present.
        df_contributions: frame with ``surname``, ``full_name``, ``amount``,
            ``percentile``, ``party`` and ``date`` columns.

    Returns:
        list[dict]: one dict per match, in the order the contributions and
        board rows are iterated.
    """
    # Materialize the board rows once. Calling df_boards.iterrows() inside the
    # contribution loop would rebuild every row Series for each contribution.
    board_members = [row for _, row in df_boards.iterrows()]

    coincidences = []
    for _, contribution in df_contributions.iterrows():
        contributor_surname = contribution["surname"]
        for board_member in board_members:
            if not is_happy_coincidence(
                contributor_surname, board_member["full_name"]
            ):
                continue
            match = {
                "contributor_name": contribution["full_name"],
                "coincidence_board_member_name": board_member["full_name"],
                "coincidence_developer_name": board_member.get(
                    "developer_name", ""
                ),
                "coincidence_developer_nip": board_member["nip"],
                "contribution_amount": int(contribution["amount"]),
                "contribution_amount_percentile": contribution["percentile"],
                "contribution_party": contribution["party"],
                "contribution_date": contribution["date"],
            }
            pprint(match)
            coincidences.append(match)
    return coincidences


def main():
    """Run the whole pipeline interactively.

    Prints the limitations notice and stops unless the user answers "yes".
    Otherwise parses the contributions, combines developers with board
    records, searches for surname coincidences, prints the result table and
    writes it to ``OUTPUT_PATH``.
    """
    print(LIMITATIONS)
    if input("Do you accept the limitations? (yes/no): ").strip().lower() != "yes":
        return

    print("Parsing contributions...")
    df_contributions = parse_contributions()

    print("\nCombining developers with board records...")
    df_boards = combine_developers_with_boards()

    print("\nFinding happy coincidences...")
    coincidences = find_happy_coincidences(df_boards, df_contributions)

    df_out = pd.DataFrame(coincidences)
    if df_out.empty:
        # A frame built from an empty list has no columns, so sorting by
        # column name would raise KeyError; report the empty result instead.
        print("\nNo coincidences found.")
    else:
        df_out = df_out.sort_values(
            by=["coincidence_developer_name", "contribution_amount"],
            ascending=[True, False],
        )

        print("\nResults:")
        try:
            print(df_out.to_markdown(index=False))
        except ImportError:
            # to_markdown depends on the optional "tabulate" package; fall
            # back to plain text so the results are still shown and saved.
            print(df_out.to_string(index=False))

    # Written even when empty, so an output file left over from an earlier
    # run is not mistaken for the current result.
    df_out.to_csv(OUTPUT_PATH, index=False)
    print(f"\nSaved to {OUTPUT_PATH}")

# Start the interactive pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()