Untitled

mail@pastecode.io avatar
unknown
plain_text
a year ago
3.8 kB
1
Indexable
Never
class ConvertLogFileToDataFrameDoFn(beam.DoFn):
    """Parse one log file into a DataFrame holding one row per matching line.

    Lines that do not match ``_LINE_PATTERN`` are silently skipped.
    """

    # Output column order; must line up with the tuples built in ``process``.
    _COLUMNS = ['timestamp', 'hostname', 'process_name', 'process_id', 'log_text']

    # Compiled once for the class instead of being looked up for every line.
    # Capture groups, in order: timestamp, hostname, event id, message text,
    # level, channel.
    _LINE_PATTERN = re.compile(
        r'Time: (\S+).*Computer: (\S+).*Event Id: (\d+),Message: (.*)Level: (\d+),Channel: (\S+)'
    )

    def process(self, element):
        """Read the file at ``element`` and return its parsed rows.

        Args:
            element: Path of the log file to read.

        Returns:
            A one-element list containing a DataFrame with the columns in
            ``_COLUMNS``. The DataFrame is empty when no line matches.
        """
        file_path = element
        print(file_path)
        # Record the input path in the module-level list that
        # SplitAndSaveFilesDoFn later walks to rename the inputs.
        # NOTE(review): sharing state through a module global presumably only
        # works when both DoFns run in the same process - confirm the runner.
        list_of_files.append(file_path)

        data = []
        # Stream the file line by line rather than loading it all at once.
        with open(file_path, "r") as f:
            for line in f:
                match = self._LINE_PATTERN.search(line)
                if not match:
                    continue
                # The level group is captured by the pattern but not emitted.
                timestamp, hostname, event_id, log_text, _level, channel = match.groups()
                data.append((
                    # Commas are stripped from free-form values, presumably so
                    # they stay comma-free in the CSV written downstream.
                    timestamp.replace(',', ''),
                    # Keep only the part of the hostname before the first dot.
                    hostname.split(".")[0],
                    # The channel fills 'process_name', the event id 'process_id'.
                    channel,
                    event_id,
                    log_text.replace(',', '').strip(),
                ))

        return [pd.DataFrame(data, columns=self._COLUMNS)]

    
class SplitAndSaveFilesDoFn(beam.DoFn):
    """Write one gzip-compressed CSV per hostname, then rename the input files."""

    def __init__(self, output_dir):
        """Store the destination directory.

        Args:
            output_dir: Directory that receives the generated ``.log.gz`` files.
        """
        super().__init__()
        self.output_dir = output_dir
        # Input paths this instance has already renamed, so each is handled once.
        self.processed_files = set()

    def process(self, element):
        """Split ``element`` by hostname and save each group as ``.log.gz``.

        Args:
            element: DataFrame with a ``hostname`` column.

        Returns:
            An empty list; this step only produces files on disk.
        """
        print(list_of_files)
        df = element
        wrote_output = False
        for hostname, group in df.groupby('hostname'):
            # get the first part of the hostname before the first dot
            hostname_prefix = hostname.split(".")[0]
            print(hostname_prefix)
            gz_filename = (
                str(hostname_prefix) + "-windows-postgre" + "-T"
                + str(current_time) + ".log" + ".gz"
            )
            output_filename = os.path.join(self.output_dir, gz_filename)
            # Let pandas write the gzip stream directly; this avoids the
            # uncompressed temporary file in the current working directory,
            # which was left behind whenever compression failed.
            group.to_csv(output_filename, header=False, index=False,
                         compression="gzip")
            # World-accessible permissions are kept from the original code.
            os.chmod(output_filename, 0o777)
            wrote_output = True

        # Rename the inputs only after every host file has been written, so a
        # failure part-way through does not leave inputs marked as processed.
        # As before, nothing is renamed when the frame produced no output.
        if wrote_output:
            self._rename_processed_inputs()

        return []

    def _rename_processed_inputs(self):
        """Rename each not-yet-handled path recorded in ``list_of_files``."""
        for file1 in list_of_files:
            if file1 in self.processed_files:
                continue
            # New name: "Processed_logs" + the basename minus its first 14
            # characters, placed inside folder_path.
            # NOTE(review): "Processed_logs" is itself 14 characters, so for
            # inputs already carrying that prefix the basename is unchanged -
            # confirm which source prefix this was meant to replace.
            processed_filename = os.path.join(
                folder_path, "Processed_logs" + os.path.basename(file1)[14:]
            )
            try:
                os.rename(file1, processed_filename)
            except FileNotFoundError:
                print(f"File not found: {file1}")
                continue
            self.processed_files.add(file1)

                       
# Pipeline: list folder_path, keep the files whose names start with
# "Processed_logs_cassandra", parse each into a DataFrame, merge all frames,
# then write one gzip file per hostname back into folder_path.
with beam.Pipeline() as pipeline:
    data_frames = (pipeline
                   | "Read Input Files" >> beam.Create(os.listdir(folder_path))
                   | "Filter Log Files" >> beam.Filter(lambda x: x.startswith("Processed_logs_cassandra"))
                   | "Get Full File Path" >> beam.Map(lambda x: os.path.join(folder_path, x))
                   | "Convert Log Data into Dataframe" >> beam.ParDo(ConvertLogFileToDataFrameDoFn())
                   # without_defaults(): when no file passes the filter the
                   # combiner would otherwise be invoked on an empty input to
                   # build a default value, and pd.concat([]) raises
                   # "No objects to concatenate". With it, nothing is emitted
                   # and the save step simply does not run.
                   | "Merge Dataframes" >> beam.CombineGlobally(pd.concat).without_defaults()
                   | "Split and Save Files" >> beam.ParDo(SplitAndSaveFilesDoFn(folder_path)))