import os
import sqlite3
import pandas as pd

# Set the database path
os.environ['SQLITE_DB_PATH'] = r'/c01/home/lidtd3o/APP_K/app_db_new.db'


def fetch_relevant_data(program_ids):
    """
    Fetch relevant data for all program IDs in a single query.
    """
    conn = sqlite3.connect(os.getenv('SQLITE_DB_PATH'))
    program_ids_tuple = tuple(program_ids)

    # Prepare the SQL query (only the number of placeholders is interpolated;
    # the program IDs themselves are passed as bound parameters)
    query = f"""
        SELECT PROGRAM_NAME, VARIABLE_NAME, VARIABLE_PARENT
        FROM BRE_ATTRIBUTE_TBL
        WHERE PROGRAM_NAME IN ({','.join(['?'] * len(program_ids_tuple))})
    """

    # Fetch data and close the connection
    data = pd.read_sql_query(query, conn, params=program_ids_tuple)
    conn.close()
    return data


def get_child_parent_mapping(df, data):
    """
    Map children and parents using the pre-fetched data.
    """
    # Create a per-program {variable_name: variable_parent} mapping for quick lookup
    parent_child_map = {}
    for _, row in data.iterrows():
        program = row['PROGRAM_NAME']
        var_name = row['VARIABLE_NAME']
        var_parent = row['VARIABLE_PARENT']
        if program not in parent_child_map:
            parent_child_map[program] = {}
        parent_child_map[program][var_name] = var_parent

    def process_row(row):
        """
        Process each row to extract child and parent.
        """
        program = row['pgm_name']
        var_name = row['target_value']

        # Skip empty or NaN values
        if pd.isna(var_name):
            return None, None

        # Handle variables written as 'CHILD OF PARENT' directly
        if ' OF ' in var_name:
            parts = [part.strip() for part in var_name.split(' OF ')]
            return parts[0], parts[1]

        # Look up in the parent-child map: the variable's own parent, and the
        # first variable (if any) that lists this variable as its parent
        mapping = parent_child_map.get(program, {})
        parent = mapping.get(var_name, None)
        child = next((key for key, value in mapping.items() if value == var_name), None)
        return child, parent

    # Apply the process_row function to the DataFrame
    results = df.apply(process_row, axis=1)
    df[['child', 'parent']] = pd.DataFrame(results.tolist(), index=df.index)
    return df


def batch_process(df, batch_size=10000):
    """
    Process the DataFrame in batches for efficiency.
    """
    # Fetch unique program IDs
    program_ids = df['pgm_name'].unique()

    # Fetch all relevant data for these program IDs in one query
    data = fetch_relevant_data(program_ids)

    # Process data in batches (copies avoid SettingWithCopyWarning when the
    # 'child'/'parent' columns are added)
    batches = [df.iloc[i:i + batch_size].copy() for i in range(0, len(df), batch_size)]
    results = []
    for batch in batches:
        processed_batch = get_child_parent_mapping(batch, data)
        results.append(processed_batch)

    # Combine all batches into a single DataFrame
    final_df = pd.concat(results, ignore_index=True)
    return final_df


# Example Usage
if __name__ == "__main__":
    # Load your large dataset (replace this with your actual file or data source)
    df = pd.read_csv('your_large_file.csv')  # Replace with your dataset

    # Process the data in batches
    final_df = batch_process(df, batch_size=10000)

    # Save the processed results with all columns
    final_df.to_csv('processed_output_with_all_columns.csv', index=False)

    print("Batch processing completed. Results saved to 'processed_output_with_all_columns.csv'.")
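
For quick verification, here is a minimal, self-contained smoke test sketch. It assumes the functions above are defined in the same session (or importable), points SQLITE_DB_PATH at a hypothetical scratch file, and fabricates one BRE_ATTRIBUTE_TBL row plus a three-row input frame covering the plain-variable, ' OF ', and missing-value cases. Only the table, column names ('pgm_name', 'target_value'), and function names come from the code above; the paths and sample values are illustrative.

# Smoke test sketch (hypothetical data and scratch path; run with the functions above in scope)
os.environ['SQLITE_DB_PATH'] = '/tmp/app_db_smoke.db'  # scratch DB, not the real one

# Build a tiny BRE_ATTRIBUTE_TBL with a single parent/child pair
conn = sqlite3.connect(os.environ['SQLITE_DB_PATH'])
conn.execute("DROP TABLE IF EXISTS BRE_ATTRIBUTE_TBL")
conn.execute("""
    CREATE TABLE BRE_ATTRIBUTE_TBL (
        PROGRAM_NAME    TEXT,
        VARIABLE_NAME   TEXT,
        VARIABLE_PARENT TEXT
    )
""")
conn.execute(
    "INSERT INTO BRE_ATTRIBUTE_TBL VALUES (?, ?, ?)",
    ('PGM001', 'WS-AMOUNT', 'WS-RECORD'),
)
conn.commit()
conn.close()

# Input rows: a plain variable, a 'CHILD OF PARENT' reference, and a missing value
sample_df = pd.DataFrame({
    'pgm_name': ['PGM001', 'PGM001', 'PGM001'],
    'target_value': ['WS-RECORD', 'WS-AMOUNT OF WS-RECORD', None],
})

out = batch_process(sample_df, batch_size=2)
print(out[['pgm_name', 'target_value', 'child', 'parent']])

Expected behaviour under these assumptions: the first row resolves child='WS-AMOUNT' from the lookup table, the second row is split directly on ' OF ', and the third row yields None for both columns.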