Untitled

mail@pastecode.io avatar
unknown
plain_text
2 years ago
1.8 kB
1
Indexable
Never
class ConvertLogFileToDataFrameDoFn(beam.DoFn):
    """Beam DoFn that parses one text log file into a pandas DataFrame.

    Each input element is treated as a path to a text file. Every line of
    that file is searched for the fields ``Time:``, ``Computer:``,
    ``Event Id:``, ``Message: ... Level:``, ``Level:`` and ``Channel:``.
    Only lines in which all six fields are found contribute a row.
    """

    def process(self, element):
        """Read the file at ``element`` and extract one row per matching line.

        Args:
            element: Path of the log file to open (passed straight to
                ``open``).

        Returns:
            A one-element list holding a DataFrame with the columns
            ``timestamp``, ``hostname``, ``process_name``, ``process_id`` and
            ``log_text``. The DataFrame is empty when no line matches.

        Raises:
            OSError: Propagated from ``open`` if the file cannot be read.
        """
        file_path = element
        print(file_path)
        # ``list_of_files`` is not defined in this class; it must exist in the
        # enclosing module. NOTE(review): presumably a module-level list used
        # for bookkeeping -- confirm it behaves as intended when the pipeline
        # runs on more than one worker.
        list_of_files.append(file_path)
        print(list_of_files)

        # NOTE(review): 'process_name' and 'process_id' are filled with the
        # Channel and Event Id values respectively. The names are kept as-is
        # so the output schema does not change; confirm that is intended.
        columns = ['timestamp', 'hostname', 'process_name', 'process_id', 'log_text']

        # Compile the patterns once per file instead of once per line.
        timestamp_re = re.compile(r'Time: (\S+)')
        hostname_re = re.compile(r'Computer: (\S+)')
        event_id_re = re.compile(r'Event Id: (\d+)')
        log_text_re = re.compile(r'Message: (.*)Level:')
        level_re = re.compile(r'Level: (\d+)')
        channel_re = re.compile(r'Channel: (\S+)')

        data = []
        # Stream the file line by line; the text encoding is the platform
        # default because ``open`` is called without ``encoding``.
        with open(file_path, "r") as f:
            for line in f:
                timestamp_match = timestamp_re.search(line)
                hostname_match = hostname_re.search(line)
                event_id_match = event_id_re.search(line)
                log_text_match = log_text_re.search(line)
                level_match = level_re.search(line)
                channel_match = channel_re.search(line)

                # A numeric ``Level:`` must be present for the line to count,
                # even though its value is not stored in the output row.
                if not (timestamp_match and hostname_match and event_id_match
                        and log_text_match and level_match and channel_match):
                    continue

                # Keep only the part of the host name before the first dot.
                hostname_prefix = hostname_match.group(1).split(".")[0]
                data.append((
                    timestamp_match.group(1),
                    hostname_prefix,
                    channel_match.group(1),
                    event_id_match.group(1),
                    log_text_match.group(1).strip(),
                ))

        # Convert the list of tuples to a DataFrame.
        df = pd.DataFrame(data, columns=columns)

        return [df]