from ReadRFX import ReadRFX
import Feature_extraction as fe
import pandas as pd
import plotly.express as px
import numpy as np
import matplotlib.pyplot as plt
import os
from tqdm.auto import tqdm
from scipy.signal import find_peaks
# from scipy.signal import ZoomFFT
from datetime import datetime
import glob
from multiprocessing import Pool
import time
import sys
def find_rpm(x, fs, distance=100, vector=True, plot=True):
    '''
    If vector is True, return the vector of per-revolution speeds;
    if False, return their mean.
    '''
    peaks, _ = find_peaks(x, height=-5, distance=distance, prominence=0.6)
    distancias_entre_picos = np.diff(peaks)       # samples between consecutive tacho pulses
    Periodo = distancias_entre_picos * (1 / fs)   # seconds per revolution
    rev_1x = 1 / Periodo                          # instantaneous 1x rotation frequency (Hz)
    mean_rev_1x = np.mean(rev_1x)
    if plot:
        plt.plot(x)
        plt.scatter(peaks, x[peaks], color='k')
        plt.xlabel('Sample')
        plt.ylabel('x value')
        plt.grid(True)
        plt.title(f'Tachometer {mean_rev_1x} Hz | {mean_rev_1x * 60} rpm')
        plt.show()
    if vector:
        return rev_1x
    else:
        return mean_rev_1x
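
# A minimal, self-contained sketch of how find_rpm is meant to be used, built
# on a synthetic tachometer signal (one rectangular pulse per revolution at an
# assumed 30 Hz shaft speed). All parameters below are illustrative
# assumptions, not values taken from the real RFX data; the helper is defined
# for reference only and is never called by this script.
def _demo_find_rpm():
    fs = 25600.0                   # assumed sample rate (Hz)
    rot_hz = 30.0                  # assumed true shaft speed: 30 Hz = 1800 rpm
    t = np.arange(0, 2.0, 1 / fs)  # 2 s of signal
    # Pulse train: high for the first 5% of each revolution, low otherwise
    x = np.where((t * rot_hz) % 1.0 < 0.05, 1.0, -1.0)
    mean_hz = find_rpm(x, fs, distance=int(0.5 * fs / rot_hz),
                       vector=False, plot=False)
    print(f'Estimated speed: {mean_hz:.2f} Hz ({mean_hz * 60:.0f} rpm)')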
def frequencyFeatures(gs, sr, speed, df_freqs):
    '''
    gs: time waveform
    sr: sample rate
    '''
    freqs, pxx = fe.espectro_fft(gs, sr)
    # Frequency resolution
    delta_f = freqs[1] - freqs[0]
    frequency_amps = []
    for i in df_freqs.index:
        # Scale the characteristic frequency to the measured speed;
        # the table is referenced to 30 Hz (1800 rpm).
        freq = df_freqs.loc[i]['Frequency'] / 30 * speed
        # Keep the spectrum inside a +/- 5*delta_f band around the target
        filt = pxx[(freqs > (freq - 5 * delta_f)) & (freqs < (freq + 5 * delta_f))]
        if len(filt):
            amp = max(filt)
        else:
            amp = np.nan
        # print(i, freq, amp)
        frequency_amps.append(amp)
    return frequency_amps
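
# A small sketch of the band-maximum lookup that frequencyFeatures performs,
# using numpy's rfft directly instead of fe.espectro_fft (whose output is
# assumed here to be a one-sided amplitude spectrum). The 1000 Hz sample rate
# and 120 Hz tone are illustrative assumptions; the helper is never called.
def _demo_band_amplitude():
    fs = 1000.0
    t = np.arange(0, 1.0, 1 / fs)
    gs = np.sin(2 * np.pi * 120.0 * t)           # single tone at 120 Hz
    pxx = 2 * np.abs(np.fft.rfft(gs)) / len(gs)  # one-sided amplitude spectrum
    freqs = np.fft.rfftfreq(len(gs), 1 / fs)
    delta_f = freqs[1] - freqs[0]                # 1 Hz resolution here
    target = 120.0
    band = pxx[(freqs > target - 5 * delta_f) & (freqs < target + 5 * delta_f)]
    print(f'Amplitude near {target} Hz: {band.max():.3f}')  # ~1.0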
def get_features(gs, speed, fs: np.float32, df_freqs, ndim=5, n=3):
    gs = gs - np.mean(gs)  # remove the DC component
    # Feature extraction
    label, item = fe.feature_extraction(gs, ndim, fs, n)
    freq_amplitudes = frequencyFeatures(gs, fs, speed, df_freqs)
    item = item + freq_amplitudes
    item.append(speed)
    label = label + list(df_freqs['Label'])
    label.append('Speed')
    features = {l: [v] for l, v in zip(label, item)}  # renamed to avoid shadowing the dict built-in
    df = pd.DataFrame(features)
    return df
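
# Sketch of the label/value assembly inside get_features: two parallel lists
# are zipped into a one-row DataFrame. fe.feature_extraction's real outputs
# are assumed to follow this (list of names, list of values) contract; the
# names and numbers below are hypothetical. Never called by this script.
def _demo_feature_row():
    label = ['RMS', 'Kurtosis', 'Speed']  # hypothetical feature names
    item = [0.42, 3.1, 30.0]              # hypothetical values
    row = {l: [v] for l, v in zip(label, item)}
    print(pd.DataFrame(row))              # one row, one column per feature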
######## NEW CODE ########
def extract_dfs(file_path, df_freq_no_duplicates):
    df_features_400k = pd.DataFrame()
    df_features_400 = pd.DataFrame()
    # Get the filename from the full path
    filename = os.path.basename(file_path)
    data_dict = ReadRFX(file_path)
    dict_pop = data_dict.pop('EA11_REF')
    x = -np.array(dict_pop['ValuesY'])
    fs = 1 / dict_pop['SamplingInterval']
    index_vector = np.arange(0, len(x) * 1 / fs, 1 / fs)  # time axis (currently unused)
    rpm = find_rpm(x, fs, distance=500, vector=False, plot=False)
    # rpm_list.append(rpm)  # Rodrigo - not used
    # if rpm > 50:
    #     break
    list_400_datapoints = []
    list_40k_datapoints = []
    # Split the measurement points by record length
    for dic in data_dict:
        num_data_points = len(data_dict[dic]['ValuesY'])
        fs = 1 / data_dict[dic]['SamplingInterval']
        if num_data_points == 400:
            list_400_datapoints.append(dic)
        else:
            list_40k_datapoints.append(dic)
    for key_40k in list_40k_datapoints:
        # print(key_40k)
        x = np.array(data_dict[key_40k]['ValuesY'])
        fs = 1 / data_dict[key_40k]['SamplingInterval']
        df_aux = get_features(x, rpm, fs, df_freq_no_duplicates, ndim=5, n=3)
        df_aux['WTG'] = filename[:6]
        # Extract the date and time from the filename:
        # split the string on '_' and take the date/time parts (indices 3 to 7)
        parts = filename.split('.')[0].split('_')
        date_str = f"{parts[3]}{parts[4]}{parts[5]}{parts[6]}{parts[7]}"
        # Convert the extracted string to datetime
        # (format: day, month, year, hour, minute)
        date_time_obj = datetime.strptime(date_str, '%d%m%Y%H%M')  # date format changed by Rodrigo
        df_aux['Date'] = date_time_obj  # filename[19:35]
        df_aux['MeasPoint'] = key_40k
        df_features_400k = pd.concat([df_features_400k, df_aux])
    list_values_400 = []
    for key_400 in list_400_datapoints:
        # print(key_400)
        x = np.array(data_dict[key_400]['ValuesY'])
        fs = 1 / data_dict[key_400]['SamplingInterval']
        list_values_400.append(np.mean(x))
    df_aux = pd.DataFrame([list_values_400], columns=list_400_datapoints)
    df_aux['WTG'] = filename[:6]
    # Same filename date/time extraction as above (indices 3 to 7)
    parts = filename.split('.')[0].split('_')
    date_str = f"{parts[3]}{parts[4]}{parts[5]}{parts[6]}{parts[7]}"
    date_time_obj = datetime.strptime(date_str, '%d%m%Y%H%M')  # date format changed by Rodrigo
    df_aux['Date'] = date_time_obj  # filename[19:35]
    # df_features_400 = pd.concat([df_features_400, df_aux])
    df_features_400 = df_aux  # only one row per file, so assignment suffices
    return df_features_400k, df_features_400
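
# Sketch of the filename date parsing used in extract_dfs. The real RFX file
# names are not shown in this script, so the example name below is
# hypothetical; it simply matches the parts[3]..parts[7] indexing above
# (day, month, year, hour, minute separated by underscores). Never called.
def _demo_parse_filename_date():
    filename = 'WTG001_aa_bb_01_09_2023_14_30.rfx'  # hypothetical name
    parts = filename.split('.')[0].split('_')
    date_str = f"{parts[3]}{parts[4]}{parts[5]}{parts[6]}{parts[7]}"
    print(datetime.strptime(date_str, '%d%m%Y%H%M'))  # 2023-09-01 14:30:00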
######## NEW CODE ########
def runExtract(full_path, df_freq_no_duplicates):
    print("Processing file: ", full_path)
    sys.stdout.flush()
    return extract_dfs(full_path, df_freq_no_duplicates)
######## END NEW CODE ########
if __name__ == '__main__':
    start_time = time.time()
    df_freq = pd.read_csv('/home/rcls/codigos/git-projects/deteccao-diagnostico-cms/GDRIVE/report10/cpfl_suzlon_characteristic_frequencies.csv')
    df_freq['Label'] = df_freq['PartName'] + '|' + df_freq['sPar1']
    df_freq_selected = df_freq[['Label', 'Ratio', 'Frequency']].dropna()
    df_freq_selected = df_freq_selected[~df_freq_selected['Label'].str.contains('2SB|3SB|FTF|3X')]
    # Directory where the RFX files are located
    directory = "/home/rcls/codigos/git-projects/deteccao-diagnostico-cms/src/scripts/features/data"
    # DataFrames that accumulate the per-file results
    df_features_400k = pd.DataFrame()
    df_features_400 = pd.DataFrame()
    # Characteristic-frequency list without duplicate labels
    df_freq_no_duplicates = df_freq_selected[['Label', 'Ratio', 'Frequency']].drop_duplicates(subset=['Label'], inplace=False)
    # Collect all RFX files in the directory
    # files = os.listdir(directory)
    files = glob.glob(directory + os.sep + "*.rfx")
    # for filename in tqdm(os.listdir(directory), desc="Inner Loop", leave=False):
    n_jump = 1  # how many files to step over per iteration
    # SERIAL
    # for i in tqdm(range(0, len(files), n_jump), desc="File Loop", position=0, leave=True):  # 1 to visit every file
    #     full_path = files[i]
    #     # Check that the file is an rfx
    #     df400k, df400 = extract_dfs(full_path, df_freq_no_duplicates)
    #     df_features_400 = pd.concat([df_features_400, df400])
    #     df_features_400k = pd.concat([df_features_400k, df400k])
    #     # Print a message indicating the file was read
    #     print(f'{full_path} has been read.')
    # END SERIAL
    ######## NEW CODE ########
    # PARALLEL
    number_threads = 8  # number of worker processes running in parallel
    pool = Pool(number_threads)
    results_df = pool.starmap(runExtract, zip(files, [df_freq_no_duplicates] * len(files)))
    pool.close()
    pool.join()
    # END PARALLEL
    # Concatenate the first element of every (df_400k, df_400) result tuple
    df_features_400k = pd.concat([x[0] for x in results_df])
    # Concatenate the second element of every result tuple
    df_features_400 = pd.concat([x[1] for x in results_df])
    ######## END NEW CODE ########
    # Study name
    case_study = 'Rel_setembro'
    print("./df_features_400k_" + case_study + ".csv")
    df_features_400k.to_csv("./df_features_400k_" + case_study + ".csv")
    df_features_400.to_csv("./df_features_400_" + case_study + ".csv")
    # Report that all RFX files have been read
    print('All RFX files in the directory have been read.')
    print("--- %s seconds ---" % (time.time() - start_time))