process.py
python process.pyunknown
python
9 days ago
7.5 kB
16
Indexable
Never
"""
This script processes data on political contributions and provided board
memberships records

MIT License

Copyright (c) 2024 Polish People

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
"""

import sys
from pathlib import Path
from pprint import pprint

import pandas as pd

# NOTE(review): the numbered list below has no item 5 in the original text;
# the numbering is kept as-is so the displayed notice is unchanged.
LIMITATIONS = """
IMPORTANT NOTE ON ANALYSIS LIMITATIONS:

This script identifies similarities between contributor surnames and board member names.
However, this approach has several limitations:

1. Surname similarity does not necessarily indicate a real connection or conflict of interest.
2. Common surnames may lead to false positives.
3. The analysis does not account for family relationships beyond exact or partial (-ski, -ska) surname similarity.
4. It does not consider potential connections through other familial relationships (e.g., in-laws).
6. The analysis returns a list of "happy coincidences" of surname similarity, so single contribution can produce multiple records.
7. Surname similarity is likely coincidental in most cases, so treat the results with utmost caution and skepticism.
8. Data sources usually do contain errors, can be incomplete etc., so make sure to validate every record manually.

Due to these limitations, each identified case must be investigated manually and individually.
The results of this script should be treated as a starting point for further investigation,
not as conclusive evidence of any wrongdoing or conflict of interest.

Users of this script are responsible for verifying any findings and ensuring that any
conclusions drawn comply with applicable laws and ethical standards.
"""

DATA_DIR = Path("./data")

# Sample data in boards.csv:
# full_name,role_name,nip
# Jan Kowalski,zarząd,1250028132
# Kasia Kowalska,zarząd,1250028567
BOARDS_PATH = DATA_DIR / "boards.csv"

# Sample data in developers.csv:
# developer_name,nip
# Grupa Domki Tanie,5251495533
# Grupa Bloczki Fajne,3371821726
DEVELOPERS_PATH = DATA_DIR / "developers.csv"

# Records of surname similarity between contributors and board members will be saved here
OUTPUT_PATH = DATA_DIR / "coincidences.csv"

# Party key -> CSV-formatted text file with that party's contributions.
CONTRIBUTIONS = {
    "pis": DATA_DIR / "wplaty-na-pis-2023-2024-pastecode_io.txt",
    "po": DATA_DIR / "wplaty-na-po-2023-2024-pastecode_io.txt",
    "psl": DATA_DIR / "wplaty-na-psl-2023-2024-pastecode_io.txt",
}

# Column names as found in the contribution files -> names used by this script.
COLUMNS_REMAP = {
    "data": "date",
    "kwota": "amount",
    "miejscowość": "city",
    "imię ojca": "father_name",
    "imię": "name",
    "nazwisko": "surname",
}

# Column order of the result table / coincidences.csv. Declared explicitly so
# an empty result still yields a frame that can be sorted and saved.
RESULT_COLUMNS = [
    "contributor_name",
    "coincidence_board_member_name",
    "coincidence_developer_name",
    "coincidence_developer_nip",
    "contribution_amount",
    "contribution_amount_percentile",
    "contribution_party",
    "contribution_date",
]


def check_inputs():
    """Exit with status 1 if the data directory or an input file is missing.

    Called from ``main()`` rather than at import time, so importing this
    module has no side effects. ``sys.exit`` is used because the bare
    ``exit`` helper is only installed by the ``site`` module.
    """
    if not DATA_DIR.exists():
        print('Data directory "data" not found')
        sys.exit(1)

    for path in (BOARDS_PATH, DEVELOPERS_PATH, *CONTRIBUTIONS.values()):
        if not path.exists():
            print(f'Data file "{path.name}" not found')
            sys.exit(1)


def normalize_name(name):
    """Return *name* lower-cased with whitespace runs collapsed to one space.

    Missing values (anything ``pd.notna`` rejects) become an empty string.
    """
    return " ".join(str(name).lower().split()) if pd.notna(name) else ""


def parse_contributions():
    """Load, normalize and merge the contribution files of all parties.

    Each file listed in ``CONTRIBUTIONS`` is read with ``pd.read_csv``,
    tagged with its party key, and has its columns renamed through
    ``COLUMNS_REMAP``. The combined frame is sorted by amount (descending),
    then party and surname, gets ``full_name`` and ``percentile`` columns,
    and is written to ``contributions.csv`` inside ``DATA_DIR``.

    Returns:
        The combined contributions DataFrame.

    Raises:
        ValueError: if a file lacks any of the expected columns.
    """
    expected_columns = set(COLUMNS_REMAP.values()) | {"party"}
    frames = []
    for party, path in CONTRIBUTIONS.items():
        df = pd.read_csv(path)
        df["party"] = party
        df = df.rename(columns=COLUMNS_REMAP)

        # Explicit raise instead of ``assert``: asserts vanish under ``-O``.
        if not expected_columns.issubset(df.columns):
            raise ValueError(
                f"Columns mismatch for {party}: {list(df.columns)} "
                f"vs {sorted(expected_columns)}"
            )

        for col in ["name", "surname", "father_name", "city"]:
            df[col] = df[col].apply(normalize_name)
        frames.append(df)
        print(f"Found {len(df)} rows for {party}")

    df = pd.concat(frames, ignore_index=True)
    df = df.sort_values(
        by=["amount", "party", "surname"], ascending=[False, True, True]
    )
    df["full_name"] = df["name"] + " " + df["surname"]
    # Fraction of contributions with an amount at or below this one.
    df["percentile"] = df["amount"].rank(pct=True).apply(lambda x: round(x, 2))
    df.to_csv(DATA_DIR / "contributions.csv", index=False)
    return df


def combine_developers_with_boards():
    """Attach developer names to board records and save the result.

    Board rows are left-joined with developers on ``nip``, so board members
    without a matching developer are kept. The text columns are normalized
    with ``normalize_name`` and the frame is written to ``boards_ext.csv``
    next to ``boards.csv``.

    Returns:
        The extended board DataFrame.
    """
    df_devs = pd.read_csv(DEVELOPERS_PATH)
    df_boards = pd.read_csv(BOARDS_PATH)
    df_boards_ext = pd.merge(df_boards, df_devs, on="nip", how="left")
    for col in ["full_name", "role_name", "developer_name"]:
        df_boards_ext[col] = df_boards_ext[col].apply(normalize_name)
    df_boards_ext.to_csv(BOARDS_PATH.with_name("boards_ext.csv"), index=False)
    return df_boards_ext


def is_happy_coincidence(contributor_surname, board_member_name):
    """Tell whether a surname resembles any word of a board member's name.

    A word matches when it equals the surname, or when both are longer than
    one character and are identical once their last character is dropped
    (which covers pairs such as "kowalski" / "kowalska"). Every word of
    *board_member_name* is checked, not only the last one.
    """
    for word in board_member_name.split():
        if contributor_surname == word:
            return True
        if (
            len(contributor_surname) > 1
            and len(word) > 1
            and contributor_surname[:-1] == word[:-1]
        ):
            return True
    return False


def find_happy_coincidences(df_boards, df_contributions):
    """Pair every contribution with every board member of similar surname.

    Args:
        df_boards: frame with ``full_name`` and ``nip`` columns and,
            optionally, ``developer_name``.
        df_contributions: frame with ``surname``, ``full_name``, ``amount``,
            ``percentile``, ``party`` and ``date`` columns.

    Returns:
        A list of dicts, one per (contribution, board member) match, in
        contribution order and then board order. Each match is also
        pretty-printed as it is found.
    """
    # Materialise the board rows once instead of rebuilding them with
    # ``iterrows()`` for every single contribution.
    board_members = df_boards.to_dict("records")

    coincidences = []
    for contribution in df_contributions.to_dict("records"):
        contributor_surname = contribution["surname"]
        for board_member in board_members:
            if not is_happy_coincidence(
                contributor_surname, board_member["full_name"]
            ):
                continue
            match = {
                "contributor_name": contribution["full_name"],
                "coincidence_board_member_name": board_member["full_name"],
                "coincidence_developer_name": board_member.get(
                    "developer_name", ""
                ),
                "coincidence_developer_nip": board_member["nip"],
                "contribution_amount": int(contribution["amount"]),
                "contribution_amount_percentile": contribution["percentile"],
                "contribution_party": contribution["party"],
                "contribution_date": contribution["date"],
            }
            pprint(match)
            coincidences.append(match)
    return coincidences


def build_results_frame(coincidences):
    """Turn the list of matches into a sorted result DataFrame.

    The columns are fixed to ``RESULT_COLUMNS`` so that an empty list gives
    an empty frame with headers instead of failing in ``sort_values``.
    Rows are ordered by developer name, then by amount (largest first).
    """
    df_out = pd.DataFrame(coincidences, columns=RESULT_COLUMNS)
    return df_out.sort_values(
        by=["coincidence_developer_name", "contribution_amount"],
        ascending=[True, False],
    )


def render_table(df):
    """Render *df* as a Markdown table, or as plain text as a fallback.

    ``DataFrame.to_markdown`` needs the optional ``tabulate`` package; when
    it is not installed the plain ``to_string`` rendering is used instead.
    """
    try:
        return df.to_markdown(index=False)
    except ImportError:
        return df.to_string(index=False)


def main():
    """Run the analysis: check inputs, ask for consent, match, report, save."""
    check_inputs()

    print(LIMITATIONS)
    if input("Do you accept the limitations? (yes/no): ").strip().lower() != "yes":
        return

    print("Parsing contributions...")
    df_contributions = parse_contributions()

    print("\nCombining developers with board records...")
    df_boards = combine_developers_with_boards()

    print("\nFinding happy coincidences...")
    coincidences = find_happy_coincidences(df_boards, df_contributions)
    df_out = build_results_frame(coincidences)

    print("\nResults:")
    if df_out.empty:
        print("No coincidences found")
    else:
        print(render_table(df_out))

    # Saved even when empty, so the output file always reflects the last run.
    df_out.to_csv(OUTPUT_PATH, index=False)
    print(f"\nSaved to {OUTPUT_PATH}")


if __name__ == "__main__":
    main()