from ReadRFX import ReadRFX
import Feature_extraction as fe
import pandas as pd
import plotly.express as px
import numpy as np
import matplotlib.pyplot as plt
import os
from tqdm.auto import tqdm
from scipy.signal import find_peaks
# from scipy.signal import ZoomFFT
from datetime import datetime
import glob
from multiprocessing import Pool
import time
import sys


def find_rpm(x, fs, distance=100, vector=True, plot=True):
    '''
    Estimate the rotation speed from a tachometer signal.
    If vector = True, return the per-revolution speed vector;
    if False, return the mean speed.
    '''
    peaks, _ = find_peaks(x, height=-5, distance=distance, prominence=0.6)
    distancias_entre_picos = np.diff(peaks)
    Periodo = distancias_entre_picos * (1 / fs)  # seconds per revolution
    rev_1x = 1 / Periodo                         # instantaneous speed [Hz]
    mean_rev_1x = np.mean(rev_1x)
    if plot:
        plt.plot(x)
        plt.scatter(peaks, x[peaks], color='k')
        plt.xlabel('Sample')
        plt.ylabel('x value')
        plt.grid(True)
        plt.title(f'Tachometer {mean_rev_1x} Hz | {mean_rev_1x * 60} rpm')
        plt.show()
    if vector:
        return rev_1x
    else:
        return mean_rev_1x


def frequencyFeatures(gs, sr, speed, df_freqs):
    '''
    gs: time waveform
    sr: sample rate
    '''
    freqs, pxx = fe.espectro_fft(gs, sr)
    # Frequency resolution
    delta_f = freqs[1] - freqs[0]
    frequency_amps = []
    for i in df_freqs.index:
        # Rescale the tabulated frequency to the measured speed;
        # the table is referenced to 30 Hz (1800 rpm).
        freq = df_freqs.loc[i]['Frequency'] / 30 * speed
        # Peak amplitude within +/- 5 frequency bins of the target frequency
        filt = pxx[(freqs > (freq - 5 * delta_f)) & (freqs < (freq + 5 * delta_f))]
        if len(filt):
            amp = max(filt)
        else:
            amp = np.nan
        frequency_amps.append(amp)
    return frequency_amps


def get_features(gs, speed, fs: np.float32, df_freqs, ndim=5, n=3):
    gs = gs - np.mean(gs)  # remove the DC component
    # Feature extraction
    label, item = fe.feature_extraction(gs, ndim, fs, n)
    freq_amplitudes = frequencyFeatures(gs, fs, speed, df_freqs)
    item = item + freq_amplitudes
    item.append(speed)
    label = label + list(df_freqs['Label'])
    label.append('Speed')
    feature_dict = {l: [v] for l, v in zip(label, item)}  # avoid shadowing the builtin dict
    return pd.DataFrame(feature_dict)
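# Minimal self-contained sketch of find_rpm on a synthetic tachometer signal.
# All values below (sample rate, shaft speed, pulse shape) are illustrative
# assumptions, not taken from the real .rfx data.
def _demo_find_rpm():
    fs_demo = 10_000.0  # assumed sample rate [Hz]
    f_shaft = 25.0      # assumed shaft speed [Hz] (1500 rpm)
    t = np.arange(0, 2.0, 1 / fs_demo)
    # One narrow pulse per revolution, riding on low-amplitude noise
    x_demo = (np.sin(2 * np.pi * f_shaft * t) > 0.999).astype(float)
    x_demo += 0.01 * np.random.randn(len(t))
    est = find_rpm(x_demo, fs_demo, distance=int(0.8 * fs_demo / f_shaft),
                   vector=False, plot=False)
    print(f'expected {f_shaft} Hz, estimated {est:.2f} Hz')
# _demo_find_rpm()  # uncomment for a quick sanity check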
######## NEW CODE ########

def parse_datetime_from_filename(filename):
    '''
    Parse the measurement timestamp from the file name: after splitting on
    '_', fields 3..7 hold day, month, year, hour and minute.
    '''
    parts = filename.split('.')[0].split('_')
    date_str = f"{parts[3]}{parts[4]}{parts[5]}{parts[6]}{parts[7]}"
    # Date pattern changed by Rodrigo: day, month, year, hour, minute
    return datetime.strptime(date_str, '%d%m%Y%H%M')


def extract_dfs(file_path, df_freq_no_duplicates):
    df_features_400k = pd.DataFrame()
    df_features_400 = pd.DataFrame()

    # Get the file name from the full path
    filename = os.path.basename(file_path)

    data_dict = ReadRFX(file_path)

    # 'EA11_REF' holds the (inverted) tachometer reference signal
    dict_pop = data_dict.pop('EA11_REF')
    x = -np.array(dict_pop['ValuesY'])
    fs = 1 / dict_pop['SamplingInterval']
    index_vector = np.arange(0, len(x) * 1 / fs, 1 / fs)  # time axis (currently unused)
    rpm = find_rpm(x, fs, distance=500, vector=False, plot=False)
    # rpm_list.append(rpm)  # Rodrigo - not used
    # if rpm > 50:
    #     break

    # Split the remaining channels by length: 400-point channels hold scalar
    # process values, the longer ones hold the vibration waveforms
    list_400_datapoints = []
    list_40k_datapoints = []
    for key in data_dict:
        num_data_points = len(data_dict[key]['ValuesY'])
        if num_data_points == 400:
            list_400_datapoints.append(key)
        else:
            list_40k_datapoints.append(key)

    date_time_obj = parse_datetime_from_filename(filename)

    for key_40k in list_40k_datapoints:
        x = np.array(data_dict[key_40k]['ValuesY'])
        fs = 1 / data_dict[key_40k]['SamplingInterval']
        df_aux = get_features(x, rpm, fs, df_freq_no_duplicates, ndim=5, n=3)
        df_aux['WTG'] = filename[:6]
        df_aux['Date'] = date_time_obj
        df_aux['MeasPoint'] = key_40k
        df_features_400k = pd.concat([df_features_400k, df_aux])

    # For the 400-point channels, keep only the mean of each channel
    list_values_400 = []
    for key_400 in list_400_datapoints:
        x = np.array(data_dict[key_400]['ValuesY'])
        list_values_400.append(np.mean(x))
    df_aux = pd.DataFrame([list_values_400], columns=list_400_datapoints)
    df_aux['WTG'] = filename[:6]
    df_aux['Date'] = date_time_obj
    # One row per file; concatenation across files happens in the main block
    df_features_400 = df_aux

    return df_features_400k, df_features_400

######## NEW CODE ########

def runExtract(full_path, df_freq_no_duplicates):
    print("Processing file: ", full_path)
    sys.stdout.flush()
    return extract_dfs(full_path, df_freq_no_duplicates)

######## END NEW CODE ########
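# Minimal sketch of the characteristic-frequency table schema assumed by the
# main block (columns PartName, sPar1, Ratio, Frequency) and of the Label /
# selection logic applied to it. The part names, ratios and frequencies below
# are hypothetical placeholders, not real Suzlon data.
def _demo_freq_table():
    df = pd.DataFrame({
        'PartName':  ['Gearbox', 'Gearbox', 'GenBearing'],
        'sPar1':     ['GMF', '2X', 'BPFO'],
        'Ratio':     [1.0, 2.0, 3.57],
        'Frequency': [600.0, 60.0, 107.1],  # Hz at the 30 Hz reference speed
    })
    df['Label'] = df['PartName'] + '|' + df['sPar1']
    sel = df[['Label', 'Ratio', 'Frequency']].dropna()
    sel = sel[~sel['Label'].str.contains('2SB|3SB|FTF|3X')]
    return sel.drop_duplicates(subset=['Label'])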
if __name__ == '__main__':
    start_time = time.time()

    df_freq = pd.read_csv('/home/rcls/codigos/git-projects/deteccao-diagnostico-cms/GDRIVE/report10/cpfl_suzlon_characteristic_frequencies.csv')
    df_freq['Label'] = df_freq['PartName'] + '|' + df_freq['sPar1']
    df_freq_selected = df_freq[['Label', 'Ratio', 'Frequency']].dropna()
    # Drop sideband, cage and 3X entries
    df_freq_selected = df_freq_selected[~df_freq_selected['Label'].str.contains('2SB|3SB|FTF|3X')]

    # Directory holding the .rfx files
    directory = "/home/rcls/codigos/git-projects/deteccao-diagnostico-cms/src/scripts/features/data"

    # Output accumulators
    rpm_list = []
    aux_list = []
    df_features_400k = pd.DataFrame()
    df_features_400 = pd.DataFrame()
    df_aux = pd.DataFrame()

    # Frequency list without duplicated labels
    df_freq_no_duplicates = df_freq_selected[['Label', 'Ratio', 'Frequency']].drop_duplicates(subset=['Label'], inplace=False)

    # Collect all .rfx files in the directory
    # files = os.listdir(directory)
    files = glob.glob(directory + os.sep + "*.rfx")

    n_jump = 1  # how many files to skip per step (used by the serial loop)

    # SERIAL
    # for i in tqdm(range(0, len(files), n_jump), desc="File Loop", position=0, leave=True):  # 1 to visit every file
    #     full_path = files[i]
    #     df400k, df400 = extract_dfs(full_path, df_freq_no_duplicates)
    #     df_features_400 = pd.concat([df_features_400, df400])
    #     df_features_400k = pd.concat([df_features_400k, df400k])
    #     print(f'{full_path} has been read.')
    # END SERIAL

    ######## NEW CODE ########
    # PARALLEL
    number_threads = 8  # number of worker processes running in parallel
    pool = Pool(number_threads)
    results_df = pool.starmap(runExtract, zip(files, [df_freq_no_duplicates] * len(files)))
    pool.close()
    pool.join()
    # END PARALLEL

    # Concatenate the first element of each result tuple
    df_features_400k = pd.concat([x[0] for x in results_df])
    # Concatenate the second element of each result tuple
    df_features_400 = pd.concat([x[1] for x in results_df])
    ######## END NEW CODE ########

    # Study name
    case_study = 'Rel_setembro'
    print("./df_features_400k_all_" + case_study + ".csv")
    df_features_400k.to_csv("./df_features_400k_" + case_study + ".csv")
    df_features_400.to_csv("./df_features_400_" + case_study + ".csv")

    print('All RFX files in the directory have been read.')
    print("--- %s seconds ---" % (time.time() - start_time))