Untitled

 avatar
unknown
plain_text
10 months ago
1.6 kB
11
Indexable
import networkx as nx
import pandas as pd
import sqlite3
import re
import matplotlib.pyplot as plt
import numpy as np
import ast
from functions import fill_conditional_seq, program_to_graph

def _first_source_value(raw):
    """Return the first element of a stringified Python literal.

    Non-string values are returned unchanged. Strings that are not a valid
    literal, or whose literal is empty or not indexable (for example
    ``"[]"``), are also returned unchanged instead of raising, so a single
    malformed row cannot abort the whole run.
    """
    if not isinstance(raw, str):
        return raw
    try:
        return ast.literal_eval(raw)[0]
    except (ValueError, SyntaxError, IndexError, KeyError, TypeError):
        return raw


def _join_columns(frame, columns, sep):
    """Join the string form of ``columns`` row by row, separated by ``sep``."""
    as_text = frame[columns].astype(str)
    return [sep.join(row) for row in as_text.itertuples(index=False, name=None)]


def process_data(ground_truth_file: str, database_file: str, output_file: str) -> None:
    """Select the BRE_LOGIC_TBL rows listed in a ground-truth CSV and save them.

    The ground-truth CSV must provide the columns ``Program_Name``,
    ``Identifier`` and ``Program_Snippet``. Rows of ``BRE_LOGIC_TBL`` whose
    ``Program_Name`` appears in the CSV are loaded, an ``Identifier`` is built
    for each row as ``Program_Name/Target_Value/Source_Value/Children/Parents``
    and only rows whose identifier is listed in the CSV are kept (first
    occurrence per identifier).

    Args:
        ground_truth_file: Path of the ground-truth CSV to read.
        database_file: Path of the SQLite database holding ``BRE_LOGIC_TBL``.
        output_file: Path of the CSV to write. The output contains the table
            columns plus ``Source_Value``, ``Identifier``, ``Actuals``
            (``Program_Name_Sequence``) and ``Program_Snippet``.
    """
    # Load the ground truth CSV
    pgms_csv = pd.read_csv(ground_truth_file)
    list_of_identifiers = pgms_csv['Identifier'].to_list()
    # De-duplicate (order preserved): fewer bound parameters, same result set.
    # SQLite caps the number of parameters per statement.
    list_of_programs = list(
        dict.fromkeys(str(p) for p in pgms_csv['Program_Name'].to_list())
    )

    # Bind the names as parameters rather than splicing them into the SQL
    # text, so names containing quotes cannot break or alter the statement.
    placeholders = ','.join('?' for _ in list_of_programs)
    query = f"SELECT * FROM BRE_LOGIC_TBL WHERE Program_Name IN ({placeholders})"

    # Connect to the SQLite database; always release the handle afterwards.
    conn = sqlite3.connect(database_file)
    try:
        data = pd.read_sql_query(sql=query, con=conn, params=list_of_programs)
    finally:
        conn.close()

    # Process 'Source_Value': first element of the stringified 'Source' literal
    data['Source_Value'] = data['Source'].apply(_first_source_value)

    # Create 'Identifier' column
    data['Identifier'] = _join_columns(
        data,
        ['Program_Name', 'Target_Value', 'Source_Value', 'Children', 'Parents'],
        '/',
    )

    # Filter data based on identifiers, then remove duplicates
    data = data[data['Identifier'].isin(list_of_identifiers)]
    data = data.drop_duplicates(subset=['Identifier'], keep='first')

    # Create 'Actuals' column
    data['Actuals'] = _join_columns(data, ['Program_Name', 'Sequence'], '_')

    # Map 'Program_Snippet'. The ground truth may list a program several
    # times; Series.map needs a unique index, so keep the first snippet seen
    # for each program.
    snippet_by_program = (
        pgms_csv.drop_duplicates(subset=['Program_Name'], keep='first')
        .set_index('Program_Name')['Program_Snippet']
    )
    data['Program_Snippet'] = data['Program_Name'].map(snippet_by_program)

    # Save to CSV
    data.to_csv(output_file, index=False)

# Example run against the local ground-truth CSV and SQLite database.
# NOTE: the output path is the same as the input path, so the ground-truth
# file is overwritten in place by this call.
process_data(
    ground_truth_file="ground_truth.csv",
    database_file="app_db_pc.db",
    output_file="ground_truth.csv",
)
Editor is loading...
Leave a Comment