Untitled
unknown
python
a year ago
6.4 kB
4
Indexable
import io
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import rarfile
import seaborn as sns
from matplotlib.cbook import boxplot_stats
from persiantools.jdatetime import JalaliDate, JalaliDateTime

# Root folder holding one sub-folder per year, each with one folder per month.
data_location = "E:/data_center/tracks/"

# Persian (Jalali) month names, in calendar order; they are used as folder names.
months = ['فروردین', 'اردیبهشت', 'خرداد', 'تیر', 'مرداد', 'شهریور',
          'مهر', 'آبان', 'آذر', 'دی', 'بهمن', 'اسفند']

# Six-digit code, some non-digits, then a name wrapped in the Unicode
# RTL-embedding marks U+202B ... U+202C.  Compiled once instead of per call.
_CODE_NAME_PATTERN = re.compile(r'(\d{6})\D+\u202b(.+?)\u202c')


#%%
def extract_code_and_name(input_string):
    """Pull the six-digit code and the embedded name out of a file name.

    Returns:
        ``(code, name)`` as strings, or ``(None, None)`` when the pattern
        does not occur in *input_string*.
    """
    match = _CODE_NAME_PATTERN.search(input_string)
    if match is None:
        return None, None
    code_mahvar, esm = match.groups()
    return code_mahvar, esm


def _iter_archives(ostan, year):
    """Yield ``(month, path)`` for every matching ``.rar`` under each month folder."""
    for month in months:
        month_dir = data_location + str(year) + '/' + month
        for root, _dirs, files in os.walk(month_dir):
            print(root)
            for file_name in files:
                if ostan in file_name and file_name.endswith('.rar'):
                    # Join with the walked root so archives found in nested
                    # folders resolve to their real location.
                    yield month, os.path.join(root, file_name)


def _daily_members(rf, path):
    """Return the non-directory members of *rf* whose name contains 'روزانه'."""
    # The original author's note says the keyword could instead be 'ساعتی'.
    daily = [m for m in rf.infolist()
             if not m.is_dir() and 'روزانه' in m.filename]
    if not daily:
        message = 'no daily exist in loc: ' + path
        print(message)
        # Raised explicitly: a bare ``assert`` disappears under ``python -O``.
        raise AssertionError(message)
    return daily


def get_mehvars_of_ostan(ostan='', year='1395'):
    """Collect ``{code: name}`` for every daily sheet found in the archives.

    Args:
        ostan: substring an archive's file name must contain ('' matches all).
        year: name of the year folder under ``data_location``.

    Returns:
        dict mapping the six-digit code (str) to the name parsed from each
        member's file name.  Members whose name does not match the
        code/name pattern are skipped.

    Raises:
        AssertionError: if a matching archive holds no daily member.
    """
    mehvars = {}
    for _month, path in _iter_archives(ostan, year):
        with rarfile.RarFile(path) as rf:
            for member in _daily_members(rf, path):
                # member.filename carries the path inside the archive too,
                # not only the base name.
                code, name = extract_code_and_name(member.filename)
                if code is None:
                    # Avoid a None key, which would break int(code) later.
                    continue
                mehvars[code] = name
    return mehvars


#%%
def get_df_of_mehvar(code, ostan='', year='1395'):
    """Read and stack the daily Excel sheet of *code* from every archive.

    Args:
        code: code to look for (str or int); matched as a substring of the
            member file name.
        ostan: substring an archive's file name must contain ('' matches all).
        year: name of the year folder under ``data_location``.

    Returns:
        One DataFrame with the rows of all sheets in visiting order, an
        empty DataFrame when no archive matched, or ``None`` as soon as an
        archive has no member for *code*.

    Raises:
        AssertionError: if a matching archive holds no daily member.
    """
    frames = []
    for month, path in _iter_archives(ostan, year):
        with rarfile.RarFile(path) as rf:
            daily = _daily_members(rf, path)
            matches = [m for m in daily if str(code) in m.filename]
            if not matches:
                print(f'cant find {code} in month {month}')
                return None
            # Only the first matching member is read.
            with rf.open(matches[0]) as member_file:
                raw = member_file.read()
        frames.append(pd.read_excel(io.BytesIO(raw)))
    if not frames:
        return pd.DataFrame()
    # Concatenate once rather than growing a frame inside the loop.
    return pd.concat(frames, ignore_index=True)


def clean_df(df):
    """Rename columns, coerce numerics, repair short days and add Jalali dates.

    The column renaming and numeric conversion are applied to the passed
    frame itself; the returned frame is a separate copy with code-less rows
    removed and the ``jalali_start`` / ``month`` columns added.

    Args:
        df: raw frame with exactly 16 columns, in the order of the names
            assigned below.

    Returns:
        The cleaned DataFrame.
    """
    df.columns = ['code', 'name', 'start', 'end', 'total_time', 'all_cars',
                  'cars_1', 'cars_2', 'cars_3', 'cars_4', 'cars_5',
                  'average_speed', 'speed_viol', 'dis_viol', 'over_viol',
                  'total_pred']
    numeric_cols = df.columns.drop(['name', 'start', 'end'])
    df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')

    # Drop rows without a code.  The explicit copy makes the assignments
    # below act on an independent frame rather than on a slice of the input.
    df = df.dropna(subset=['code']).copy()

    # Repair rows whose total_time is below 1400: prefer total_pred when it
    # is present, otherwise scale the observed count up to 1440.
    # NOTE(review): 1440 is presumably minutes per day — confirm units; a
    # total_time of 0 would produce inf here.
    short = df['total_time'] < 1400
    has_pred = df['total_pred'].notna()
    df.loc[short & has_pred, 'all_cars'] = df['total_pred']
    df.loc[short & ~has_pred, 'all_cars'] = (
        df['all_cars'] * (1440 / df['total_time'])
    )

    # Swap JalaliDate for JalaliDateTime here if the time field is wanted too.
    df['jalali_start'] = df['start'].apply(
        lambda x: JalaliDate.fromisoformat(x.replace('/', '-'))
    )
    df['month'] = df['jalali_start'].apply(lambda x: x.month)
    return df


#%%
def sketch_time_series(df, name_of_mehvar, time, var, x_ticks=None):
    """Plot ``df[var]`` against ``df[time]`` converted to Gregorian dates.

    Args:
        df: frame holding both columns.
        name_of_mehvar: text appended to the plot title.
        time: name of a column of ``JalaliDate`` values.
        var: name of the column to plot; also used as the y label.
        x_ticks: optional sequence of tick positions for the x axis.
    """
    plt.figure(figsize=(10, 6))
    gregorian_dates = df[time].apply(lambda x: JalaliDate.to_gregorian(x))
    plt.plot(gregorian_dates, df[var], marker='o', linestyle='-', color='b')
    plt.title('Time Series Plot of ' + name_of_mehvar)
    plt.xlabel('Time')
    plt.ylabel(var)
    plt.grid(True)
    # Length check instead of truthiness so array-like ticks work as well.
    if x_ticks is not None and len(x_ticks) > 0:
        plt.xticks(x_ticks)
    plt.show()


def sketch_distrubution(df, name_of_mehvar, var):
    """Draw one box of ``df[var]`` per Jalali month of ``df['jalali_start']``.

    Args:
        df: frame with a ``jalali_start`` column and the *var* column.
        name_of_mehvar: text appended to the plot title.
        var: name of the column whose distribution is shown.
    """
    plt.figure(figsize=(8, 6))
    sns.boxplot(x=df['jalali_start'].apply(lambda x: x.month), y=df[var])
    plt.title('Box Plot of ' + name_of_mehvar)
    # The x axis carries the month number, so label it as such.
    plt.xlabel('Month')
    plt.ylabel('Values of ' + var)
    plt.show()


#%%
# Gregorian date of the first day of each Jalali month of 1395 (x-axis ticks).
month_grids = [JalaliDate(1395, i, 1).to_gregorian() for i in range(1, 13)]
mehvars = get_mehvars_of_ostan('تهران')
missing = []
df = pd.DataFrame()
#%%
# Maps a 'YYYY/MM/DD' day string to how often it was flagged as an outlier.
# NOTE(review): nothing fills this by default; pass it as ``tally`` to
# count_outlier_of_days (e.g. ``.apply(count_outlier_of_days,
# tally=not_ordinal_days)``) to enable counting.
not_ordinal_days = {}

# Codes up to and including this value are skipped by the loop below.
# NOTE(review): looks like a resume point from an earlier run — confirm.
SKIP_CODES_UP_TO = 114552

# Divisor used to turn counts into percentages.
# NOTE(review): presumably the number of mehvars processed — confirm.
OUTLIER_PERCENT_BASE = 89


def count_outlier_of_days(df, tally=None):
    """Print the days whose ``all_cars`` lies outside the box-plot whiskers.

    Args:
        df: frame with ``all_cars`` and ``jalali_start`` columns.
        tally: optional dict; when given, each outlier day increments
            ``tally['YYYY/MM/DD']``.  Left as ``None`` only printing happens.
    """
    box = boxplot_stats(df['all_cars'])[0]
    whisker_low = box['whislo']
    whisker_high = box['whishi']
    outside = (df['all_cars'] < whisker_low) | (df['all_cars'] > whisker_high)
    outliar_days = df.loc[outside, 'jalali_start']
    print(outliar_days)
    if tally is not None:
        for day in outliar_days:
            key = day.strftime('%Y/%m/%d')
            tally[key] = tally.get(key, 0) + 1


#%%
for code, name in mehvars.items():
    # A file name that did not match the code pattern yields a None code.
    if code is None:
        continue
    if int(code) <= SKIP_CODES_UP_TO:
        continue
    df = get_df_of_mehvar(code, 'تهران')
    if df is None:
        # No sheet for this code in at least one archive; remember and move on.
        missing.append(code)
        continue
    df = clean_df(df)
    sketch_distrubution(df, name, 'all_cars')
    sketch_time_series(df, name, 'jalali_start', 'all_cars', x_ticks=month_grids)
    # Called once per month group for its printing side effect.
    df.groupby('month').apply(count_outlier_of_days)

#%%
# Convert raw counts to percentages in place, then rank days by percentage.
for day in not_ordinal_days:
    not_ordinal_days[day] = (not_ordinal_days[day] / OUTLIER_PERCENT_BASE) * 100
check = sorted(not_ordinal_days.items(), key=lambda item: item[1])
Editor is loading...
Leave a Comment