class ConvertLogFileToDataFrameDoFn(beam.DoFn):
    """Parse one event-log text file into a pandas DataFrame.

    Each input element is used as a path passed to ``open``. Lines that
    match the ``Time: ..., Computer: ..., Event Id: ..., Message: ...,
    Level: ..., Channel: ...`` layout become one row; lines that do not
    match are skipped silently.
    """

    def process(self, element):
        """Read the file at ``element`` and return its parsed rows.

        Args:
            element: Path of the log file to read.

        Returns:
            A one-element list holding a DataFrame with the columns
            ``timestamp``, ``hostname``, ``process_name``, ``process_id``
            and ``log_text``. The DataFrame is empty (columns only) when
            no line matches.
        """
        file_path = element
        print(file_path)
        # NOTE(review): ``list_of_files`` is defined outside this class
        # (presumably a module-level list). On a distributed runner each
        # worker would likely mutate its own copy -- confirm callers do
        # not rely on it being complete.
        list_of_files.append(file_path)
        print(list_of_files)

        columns = ['timestamp', 'hostname', 'process_name', 'process_id', 'log_text']

        # Compiled once per file rather than looked up for every line.
        line_pattern = re.compile(
            r'Time: (\S+),\s*Computer: (\S+),\s*Event Id: (\d+),\s*Message: ([^,]*).*Level: (\d+),\s*Channel: (\S+)'
        )

        data = []
        # Stream the file line by line instead of materialising it first.
        with open(file_path, "r") as f:
            for line in f:
                match = line_pattern.search(line)
                if not match:
                    continue
                # The level is captured by the pattern but not stored.
                timestamp, hostname, event_id, log_text, _level, channel = match.groups()
                # Keep only the part of the host name before the first dot.
                hostname_prefix = hostname.split(".")[0]
                # Column mapping: the channel is stored under
                # 'process_name' and the event id under 'process_id'.
                data.append((timestamp, hostname_prefix, channel, event_id, log_text.strip()))

        df = pd.DataFrame(data, columns=columns)
        return [df]