Untitled
unknown
plain_text
2 years ago
1.8 kB
1
Indexable
Never
class ConvertLogFileToDataFrameDoFn(beam.DoFn):
    """Parse one event-log text file into a pandas DataFrame.

    Each input element is the path of a text file. Every line of that file
    is searched for the ``Time:``, ``Computer:``, ``Event Id:``,
    ``Message:``, ``Level:`` and ``Channel:`` fields. A line contributes a
    row only when all six fields are found on that same line; other lines
    are skipped silently.
    """

    # Output column names. NOTE: the value captured after ``Channel:`` is
    # stored under 'process_name' and the value captured after ``Event Id:``
    # is stored under 'process_id' (kept as in the original schema).
    _COLUMNS = ['timestamp', 'hostname', 'process_name', 'process_id', 'log_text']

    # Patterns are compiled once instead of being looked up on every line.
    _TIMESTAMP_RE = re.compile(r'Time: (\S+)')
    _HOSTNAME_RE = re.compile(r'Computer: (\S+)')
    _EVENT_ID_RE = re.compile(r'Event Id: (\d+)')
    _LOG_TEXT_RE = re.compile(r'Message: (.*)Level:')
    _LEVEL_RE = re.compile(r'Level: (\d+)')
    _CHANNEL_RE = re.compile(r'Channel: (\S+)')

    def _parse_line(self, line):
        """Return the row tuple for *line*, or ``None`` if any field is missing."""
        timestamp_match = self._TIMESTAMP_RE.search(line)
        hostname_match = self._HOSTNAME_RE.search(line)
        event_id_match = self._EVENT_ID_RE.search(line)
        log_text_match = self._LOG_TEXT_RE.search(line)
        level_match = self._LEVEL_RE.search(line)
        channel_match = self._CHANNEL_RE.search(line)

        # The level value itself is not stored, but a numeric ``Level:``
        # field must still be present for the line to be accepted.
        if not (timestamp_match and hostname_match and event_id_match
                and log_text_match and level_match and channel_match):
            return None

        # Keep only the part of the host name before the first dot.
        hostname_prefix = hostname_match.group(1).split(".")[0]
        return (
            timestamp_match.group(1),
            hostname_prefix,
            channel_match.group(1),
            event_id_match.group(1),
            log_text_match.group(1).strip(),
        )

    def process(self, element):
        """Read the file at *element* and emit a single DataFrame.

        Args:
            element: Path of the log file to parse.

        Returns:
            A one-element list holding a DataFrame with the columns in
            ``_COLUMNS``; the frame is empty when no line matches.
        """
        file_path = element
        print(file_path)
        # NOTE(review): list_of_files is module-level state defined outside
        # this class. Presumably appends made on a remote worker are not
        # visible to the launching process -- confirm against the runner.
        list_of_files.append(file_path)
        print(list_of_files)

        rows = []
        # Iterate the file lazily rather than loading every line up front.
        with open(file_path, "r") as f:
            for line in f:
                row = self._parse_line(line)
                if row is not None:
                    rows.append(row)

        return [pd.DataFrame(rows, columns=self._COLUMNS)]