Untitled
unknown
plain_text
10 months ago
3.4 kB
3
Indexable
import os
import sqlite3
import pandas as pd
# Set the database path
os.environ['SQLITE_DB_PATH'] = r'/c01/home/lidtd3o/APP_K/app_db_new.db'
def fetch_relevant_data(program_ids):
"""
Fetch relevant data for all program IDs in a single query.
"""
conn = sqlite3.connect(os.getenv('SQLITE_DB_PATH'))
program_ids_tuple = tuple(program_ids)
# Prepare the SQL query
query = f"""
SELECT PROGRAM_NAME, VARIABLE_NAME, VARIABLE_PARENT
FROM BRE_ATTRIBUTE_TBL
WHERE PROGRAM_NAME IN ({','.join(['?'] * len(program_ids_tuple))})
"""
# Fetch data and close the connection
data = pd.read_sql_query(query, conn, params=program_ids_tuple)
conn.close()
return data
def get_child_parent_mapping(df, data):
"""
Map children and parents using the pre-fetched data.
"""
# Create a mapping for quick lookup
parent_child_map = {}
for _, row in data.iterrows():
program = row['PROGRAM_NAME']
var_name = row['VARIABLE_NAME']
var_parent = row['VARIABLE_PARENT']
if program not in parent_child_map:
parent_child_map[program] = {}
parent_child_map[program][var_name] = var_parent
def process_row(row):
"""
Process each row to extract child and parent.
"""
program = row['pgm_name']
var_name = row['target_value']
if not pd.notna(var_name): # Skip empty or NaN values
return None, None
# Handle variables with ' OF ' directly
if ' OF ' in var_name:
parts = [part.strip() for part in var_name.split(' OF ')]
return parts[0], parts[1]
# Look up in parent-child map
mapping = parent_child_map.get(program, {})
parent = mapping.get(var_name, None)
child = next((key for key, value in mapping.items() if value == var_name), None)
return child, parent
# Apply the process_row function to the DataFrame
results = df.apply(process_row, axis=1)
df[['child', 'parent']] = pd.DataFrame(results.tolist(), index=df.index)
return df
def batch_process(df, batch_size=10000):
"""
Process the DataFrame in batches for efficiency.
"""
# Fetch unique program IDs
program_ids = df['pgm_name'].unique()
# Fetch all relevant data for these program IDs
data = fetch_relevant_data(program_ids)
# Process data in batches
batches = [df[i:i+batch_size] for i in range(0, len(df), batch_size)]
results = []
for batch in batches:
processed_batch = get_child_parent_mapping(batch, data)
results.append(processed_batch)
# Combine all batches into a single DataFrame
final_df = pd.concat(results, ignore_index=True)
return final_df
# Example Usage
if __name__ == "__main__":
# Load your large dataset (replace this with your actual file or data source)
df = pd.read_csv('your_large_file.csv') # Replace with your dataset
df = df[['pgm_name', 'target_value']] # Ensure only relevant columns are included
# Process the data in batches
final_df = batch_process(df, batch_size=10000)
# Save the processed results
final_df.to_csv('processed_output.csv', index=False)
print("Batch processing completed. Results saved to 'processed_output.csv'.")
Editor is loading...
Leave a Comment