Untitled
unknown
plain_text
3 months ago
3.4 kB
5
Indexable
"""Filter gzipped CloudTrail-style JSON logs by source IP and export a CSV.

Recursively scans a directory for ``.json.gz`` files, keeps the records whose
``sourceIPAddress`` is listed in ``target_ips``, and writes selected fields of
those records to a CSV file.
"""

import gzip
import json
import os
import zlib

import pandas as pd

# tqdm is optional: when it is not installed, progress is printed manually.
try:
    from tqdm import tqdm
except ImportError:
    tqdm = None

# IP addresses to filter by
target_ips = ['167.71.174.44', '162.33.179.99']


def _as_dict(value):
    """Return *value* if it is a dict, otherwise an empty dict.

    ``dict.get(key, {})`` only falls back to ``{}`` when the key is absent;
    a key that is present with a JSON ``null`` yields ``None``. This helper
    makes chained ``.get`` lookups safe in both cases.
    """
    return value if isinstance(value, dict) else {}


def _load_records(file_path):
    """Read one gzipped JSON file and return its ``Records`` list.

    Returns an empty list when the top-level value is not an object or when
    ``Records`` is missing or not a list. I/O and parse errors propagate to
    the caller.
    """
    with gzip.open(file_path, 'rt', encoding='utf-8') as f:
        data = json.load(f)
    if not isinstance(data, dict):
        return []
    records = data.get('Records')
    return records if isinstance(records, list) else []


def _extract_fields(record, src_ip):
    """Build the flat output row for a single matching record."""
    # Handle userAgent (if it's a list, take the first element)
    user_agent = record.get('userAgent', 'N/A')
    if isinstance(user_agent, list):
        user_agent = user_agent[0] if user_agent else 'N/A'

    # Either sub-object may be null in the JSON, hence _as_dict.
    user_identity = _as_dict(record.get('userIdentity'))
    request_params = _as_dict(record.get('requestParameters'))

    return {
        'sourceIPAddress': src_ip,
        'eventTime': record.get('eventTime', 'N/A'),
        'userAgent': user_agent,
        'eventName': record.get('eventName', 'N/A'),
        'userIdentity.userName': user_identity.get('userName', 'N/A'),
        'bucketName': request_params.get('bucketName', 'N/A'),
        'Host': request_params.get('Host', 'N/A'),
    }


def process_file(file_path):
    """
    Process a single .json.gz file and return a list of dictionaries
    for records that match the target source IP addresses.

    Args:
        file_path: Path to a gzip-compressed JSON file with a top-level
            ``Records`` list.

    Returns:
        A list of dicts with the keys ``sourceIPAddress``, ``eventTime``,
        ``userAgent``, ``eventName``, ``userIdentity.userName``,
        ``bucketName`` and ``Host``. Missing fields are reported as
        ``'N/A'``. If the file cannot be read or parsed, an error is printed
        and an empty list is returned.
    """
    # Only reading/parsing is guarded: one unreadable file must not abort
    # the whole run, but field extraction below is written so that it does
    # not raise on null or missing values.
    try:
        raw_records = _load_records(file_path)
    except (OSError, EOFError, ValueError, zlib.error) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        print(f"Error processing file {file_path}: {e}")
        return []

    matches = []
    for record in raw_records:
        if not isinstance(record, dict):
            continue
        src_ip = record.get('sourceIPAddress')
        if not isinstance(src_ip, str):
            # Missing or null source IP can never match a target address.
            continue
        src_ip = src_ip.strip()
        if src_ip in target_ips:
            matches.append(_extract_fields(record, src_ip))
    return matches


def _find_log_files(root_dir):
    """Recursively collect the paths of all ``.json.gz`` files under *root_dir*."""
    return [
        os.path.join(root, name)
        for root, _dirs, files in os.walk(root_dir)
        for name in files
        if name.endswith('.json.gz')
    ]


def main(root_dir='.', output_csv='filtered_cloudtrail_data.csv'):
    """Scan *root_dir* for logs, filter them, and write matches to *output_csv*.

    Args:
        root_dir: Directory to walk recursively (defaults to the current
            working directory).
        output_csv: Path of the CSV file to write when at least one record
            matches.
    """
    file_paths = _find_log_files(root_dir)
    total_files = len(file_paths)
    print(f"Found {total_files} .json.gz files to process.")

    # Use tqdm if available; otherwise, iterate normally.
    iterator = (
        tqdm(file_paths, total=total_files, desc="Processing files")
        if tqdm
        else file_paths
    )

    all_records = []
    for idx, file_path in enumerate(iterator, start=1):
        all_records.extend(process_file(file_path))
        if not tqdm:
            print(f"Processed {idx}/{total_files} files.")

    if not all_records:
        print("No matching records found.")
        return

    pd.DataFrame(all_records).to_csv(output_csv, index=False)
    print(f"Saved {len(all_records)} records to {output_csv}")


if __name__ == "__main__":
    main()
Editor is loading...
Leave a Comment