# Untitled

import networkx as nx
import pandas as pd
import sqlite3
import re
import matplotlib.pyplot as plt
import numpy as np
import ast
from functions import fill_conditional_seq, program_to_graph

def process_data(ground_truth_file: str, database_file: str, output_file: str) -> None:
    """Build an enriched ground-truth CSV from the BRE_LOGIC_TBL SQLite table.

    Reads the ground-truth CSV, selects the ``BRE_LOGIC_TBL`` rows whose
    ``Program_Name`` appears in it, rebuilds an ``Identifier`` for every row,
    keeps only the rows whose identifier is listed in the CSV (first
    occurrence per identifier), adds the ``Actuals`` and ``Program_Snippet``
    columns and writes the result to ``output_file``.

    Args:
        ground_truth_file: CSV with at least the columns ``Program_Name``,
            ``Identifier`` and ``Program_Snippet``.
        database_file: SQLite database containing ``BRE_LOGIC_TBL`` with at
            least the columns ``Program_Name``, ``Target_Value``, ``Source``,
            ``Children``, ``Parents`` and ``Sequence``.
        output_file: Destination CSV path (written without the index).

    Raises:
        ValueError, SyntaxError: If a string ``Source`` cell is not a valid
            Python literal.
        IndexError: If a ``Source`` literal is an empty sequence.
    """
    # Load the ground truth CSV
    pgms_csv = pd.read_csv(ground_truth_file)
    list_of_identifiers = pgms_csv['Identifier'].to_list()
    # str() mirrors how names were previously formatted into the query text;
    # dict.fromkeys drops repeated names while keeping first-seen order, so
    # the number of bound parameters stays as small as possible.
    program_names = list(dict.fromkeys(str(p) for p in pgms_csv['Program_Name']))

    # Bind the program names as parameters instead of splicing them into the
    # SQL text, so names containing quotes cannot break or alter the query.
    placeholders = ','.join('?' * len(program_names))
    query = f"SELECT * FROM BRE_LOGIC_TBL WHERE Program_Name IN ({placeholders})"

    # A sqlite3 connection used as a context manager only manages the
    # transaction, it does not close the connection -- close it explicitly.
    conn = sqlite3.connect(database_file)
    try:
        data = pd.read_sql_query(sql=query, con=conn, params=program_names)
    finally:
        conn.close()

    def _first_source(value):
        # String cells hold a Python literal (e.g. "['A', 'B']"); take its
        # first element. Non-string cells are passed through unchanged.
        if isinstance(value, str):
            return ast.literal_eval(value)[0]
        return value

    # Process 'Source_Value'
    data['Source_Value'] = data['Source'].apply(_first_source)

    # Create 'Identifier' column: the five fields joined with '/'
    id_columns = ['Program_Name', 'Target_Value', 'Source_Value', 'Children', 'Parents']
    data['Identifier'] = data[id_columns].astype(str).agg('/'.join, axis=1)

    # Keep only rows listed in the ground truth, one row per identifier.
    # The explicit copy makes the later column assignments operate on an
    # independent frame rather than on a filtered selection.
    data = data[data['Identifier'].isin(list_of_identifiers)]
    data = data.drop_duplicates(subset=['Identifier'], keep='first').copy()

    # Create 'Actuals' column: "<Program_Name>_<Sequence>"
    data['Actuals'] = data[['Program_Name', 'Sequence']].astype(str).agg('_'.join, axis=1)

    # Map 'Program_Snippet'. The CSV may list a program on several rows (one
    # per identifier); Series.map needs a uniquely indexed mapper, so keep
    # the first snippet seen for each program.
    snippet_by_program = (
        pgms_csv.drop_duplicates(subset='Program_Name', keep='first')
        .set_index('Program_Name')['Program_Snippet']
    )
    data['Program_Snippet'] = data['Program_Name'].map(snippet_by_program)

    # Save to CSV
    data.to_csv(output_file, index=False)

# Example usage
# NOTE: output_file is the same path as ground_truth_file, so the input CSV
# is overwritten with the enriched result.
process_data(
    ground_truth_file="ground_truth.csv",
    database_file="app_db_pc.db",
    output_file="ground_truth.csv",
)
# Editor is loading...