Untitled
def find_previous_seq_id_flow(df, start_seq_id, specific_target=None):
    """
    Trace a chain of seq_ids backwards from ``start_seq_id``.

    At each hop the first row for the current seq_id is taken, its
    ``Source`` is parsed with ``preprocess_values``, and the first parsed
    source value is searched for among the parsed ``Target`` values of every
    row whose ``Sequence`` is smaller than the current one. The largest
    matching ``Sequence`` becomes the next seq_id. The walk stops when the
    current seq_id has no rows, its row yields no source values, or no
    earlier row targets that source value.

    Args:
        df: DataFrame with ``Sequence``, ``Source`` and ``Target`` columns.
        start_seq_id: seq_id to start from. A falsy value (``None``, ``0``)
            skips the walk entirely.
        specific_target: Optional ``Target`` value used to select the row at
            the starting seq_id only; later hops are not filtered by it.
            A falsy value disables the filter.

    Returns:
        list: The seq_ids in visiting order, beginning with
        ``start_seq_id``. The last seq_id reached is included even when the
        walk stopped on it (for example because it has no rows in ``df``),
        unless it is missing according to ``pd.isna``.

    Note:
        ``preprocess_values`` is defined outside this function; it is
        assumed to return a sequence supporting truthiness, indexing and
        ``in`` -- TODO confirm against its definition.
    """
    flow = []
    current_seq_id = start_seq_id
    at_start = True
    # (sequence, parsed_targets) pairs for rows that precede the current
    # seq_id. Built lazily on the first hop that needs it, then only ever
    # narrowed, because the seq_ids visited are expected to decrease.
    earlier = None

    while current_seq_id:
        rows = df[df['Sequence'] == current_seq_id]
        if rows.empty:
            break  # The current seq_id does not occur in df.

        # The target filter picks the starting row only. Applying it on
        # later hops would empty ``rows`` and cut the chain after one step.
        if at_start and specific_target:
            rows = rows[rows['Target'] == specific_target]
            if rows.empty:
                break  # No starting row has the requested target.
        at_start = False

        source_values = preprocess_values(rows.iloc[0]['Source'])
        if not source_values:
            break  # Nothing to trace back from.
        first_source = source_values[0]

        if earlier is None:
            # Parse each earlier Target exactly once instead of copying the
            # frame and re-parsing it on every iteration. Rows at or after
            # the starting seq_id are never parsed.
            before_start = df['Sequence'] < current_seq_id
            earlier = list(zip(
                df.loc[before_start, 'Sequence'].tolist(),
                df.loc[before_start, 'Target'].apply(preprocess_values).tolist(),
            ))

        # Keep only rows that still precede the current seq_id.
        earlier = [
            (seq, targets) for seq, targets in earlier
            if seq < current_seq_id
        ]
        matching_seq_ids = [
            seq for seq, targets in earlier if first_source in targets
        ]
        if not matching_seq_ids:
            break  # No earlier row produces this source value.

        # Record the current hop, then continue from the closest (largest)
        # earlier seq_id that targets the source value.
        flow.append(current_seq_id)
        current_seq_id = int(max(matching_seq_ids))

    # The seq_id the walk stopped on has not been recorded yet.
    if current_seq_id not in flow and not pd.isna(current_seq_id):
        flow.append(current_seq_id)

    return flow
Leave a Comment