# Paste-site metadata (not part of the program):
# Untitled
# unknown
# python
# 2 years ago
# 6.4 kB
# 7
# Indexable
data_location = "E:/data_center/tracks/"
import os
import io
import rarfile
import re
months = ['فروردین', 'اردیبهشت', 'خرداد', 'تیر', 'مرداد', 'شهریور', 'مهر', 'آبان', 'آذر', 'دی', 'بهمن', 'اسفند']
import pandas as pd
import numpy as np
from persiantools.jdatetime import JalaliDate, JalaliDateTime
import matplotlib.pyplot as plt
from matplotlib.cbook import boxplot_stats
import seaborn as sns
#%%
def extract_code_and_name(input_string):
pattern = re.compile(r'(\d{6})\D+\u202b(.+?)\u202c')
match = pattern.search(input_string)
if match:
code_mahvar, esm = match.groups()
return code_mahvar, esm
return None, None
def get_mehvars_of_ostan(ostan=''):
    """Collect the code -> name mapping of every daily sheet in the 1395 archives.

    Walks each month folder under ``data_location + '1395/'``, opens every
    ``.rar`` file whose file name contains ``ostan`` and reads the names of
    the entries marked 'روزانه' (the hourly variant is marked 'ساعتی').

    Args:
        ostan: Substring that an archive's file name must contain; the
            default empty string matches every archive.

    Returns:
        dict mapping the six-digit code (str) to the name extracted from the
        entry's file name. Entries whose name does not match the expected
        pattern are skipped.

    Raises:
        AssertionError: if a matching archive contains no daily entry.
    """
    mehvars = {}
    year_dir = data_location + '1395/'
    for month in months:
        for root, _dirs, files in os.walk(year_dir + month):
            print(root)
            for file_name in files:
                if ostan not in file_name or not file_name.endswith('.rar'):
                    continue
                # Build the path from `root`: os.walk also yields archives
                # that live in sub-folders of the month directory.
                archive_path = os.path.join(root, file_name)
                # The context manager closes the archive handle on every path.
                with rarfile.RarFile(archive_path) as rf:
                    daily = [x for x in rf.infolist()
                             if (not x.is_dir()) and 'روزانه' in x.filename]
                if not daily:
                    print('no daily exist in loc: ' + archive_path)
                    # Explicit raise: a bare `assert` disappears under `python -O`.
                    raise AssertionError('no daily entry in ' + archive_path)
                for entry in daily:
                    # entry.filename also carries path information besides the name.
                    code, name = extract_code_and_name(entry.filename)
                    if code is None:
                        # Unparseable name; a None key would break int(code) later.
                        continue
                    mehvars[code] = name
    return mehvars
#%%
def get_df_of_mehvar(code, ostan=''):
    """Load and stack the daily Excel sheets of one code for every month of 1395.

    For each ``.rar`` archive whose file name contains ``ostan``, the first
    daily ('روزانه') entry whose name contains ``code`` is read with
    ``pandas.read_excel`` and appended to the result.

    Args:
        code: Code to look for inside the entry names (str or int).
        ostan: Substring that an archive's file name must contain; the
            default empty string matches every archive.

    Returns:
        A DataFrame with the rows of all matching sheets (fresh 0..n-1
        index), an empty DataFrame when no archive matched, or ``None`` as
        soon as one matching archive has no entry for ``code``.

    Raises:
        AssertionError: if a matching archive contains no daily entry.
    """
    frames = []
    year_dir = data_location + '1395/'
    for month in months:
        for root, _dirs, files in os.walk(year_dir + month):
            print(root)
            for file_name in files:
                if ostan not in file_name or not file_name.endswith('.rar'):
                    continue
                # Build the path from `root`: os.walk also yields archives
                # that live in sub-folders of the month directory.
                archive_path = os.path.join(root, file_name)
                # The context manager closes the archive handle on every path.
                with rarfile.RarFile(archive_path) as rf:
                    daily = [x for x in rf.infolist()
                             if (not x.is_dir()) and 'روزانه' in x.filename]
                    if not daily:
                        print('no daily exist in loc: ' + archive_path)
                        # Explicit raise: a bare `assert` disappears under `python -O`.
                        raise AssertionError('no daily entry in ' + archive_path)
                    # Entry names also contain path information, hence the
                    # substring test instead of an equality test.
                    mehvar = [x for x in daily if str(code) in x.filename]
                    if not mehvar:
                        # str(): `code` may be an int, which cannot be concatenated.
                        print('cant find ' + str(code) + ' in month ' + month)
                        return None
                    payload = rf.read(mehvar[0])
                frames.append(pd.read_excel(io.BytesIO(payload)))
    if not frames:
        # pd.concat rejects an empty list; keep the "nothing found" result.
        return pd.DataFrame()
    # One concat at the end instead of re-copying the result for every sheet.
    return pd.concat(frames, ignore_index=True)
def clean_df(df):
    """Return a cleaned copy of a raw 16-column daily sheet.

    Steps:
      * rename the columns positionally to the fixed schema below;
      * coerce every column except ``name``/``start``/``end`` to numbers
        (unparseable cells become NaN);
      * drop rows without a numeric ``code``;
      * for rows with ``total_time`` below 1400, replace ``all_cars`` with
        ``total_pred`` when present, otherwise scale it by
        ``1440 / total_time`` (presumably minutes per day — confirm);
      * add ``jalali_start`` (JalaliDate parsed from ``start``) and ``month``.

    Args:
        df: Raw frame with exactly 16 columns in the order of the schema.
            It is not modified.

    Returns:
        A new DataFrame with the cleaned rows and the two extra columns.
    """
    # Work on a copy so the caller's frame is never half-modified.
    df = df.copy()
    df.columns = ['code', 'name', 'start', 'end', 'total_time', 'all_cars',
                  'cars_1', 'cars_2', 'cars_3', 'cars_4', 'cars_5',
                  'average_speed', 'speed_viol', 'dis_viol', 'over_viol', 'total_pred']
    numeric_cols = df.columns.drop(['name', 'start', 'end'])
    df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')
    # Drop rows that don't have a code. The explicit copy keeps the .loc
    # assignments below from writing into a slice of another frame.
    df = df.dropna(subset=['code']).copy()

    # Fix incomplete days. Both masks depend only on columns that are not
    # written below, so they can be computed once.
    short_day = df['total_time'] < 1400
    has_pred = df['total_pred'].notna()
    df.loc[short_day & has_pred, 'all_cars'] = df['total_pred']
    df.loc[short_day & ~has_pred, 'all_cars'] = df['all_cars'] * (1440 / df['total_time'])

    # Replace JalaliDate with JalaliDateTime if you want the time field too.
    df['jalali_start'] = df['start'].apply(lambda x: JalaliDate.fromisoformat(x.replace('/', '-')))
    df['month'] = df['jalali_start'].apply(lambda x: x.month)
    return df
#%%
def sketch_time_series(df, name_of_mehvar, time, var, x_ticks=None):
    """Show a line plot of ``df[var]`` against the dates in ``df[time]``.

    Args:
        df: Frame holding both columns.
        name_of_mehvar: Text appended to the plot title.
        time: Name of the column with JalaliDate values; they are converted
            to Gregorian dates for the x-axis.
        var: Name of the column to plot; also used as the y-axis label.
        x_ticks: Optional tick positions for the x-axis; ignored when falsy.
    """
    plt.figure(figsize=(10, 6))

    gregorian_dates = df[time].apply(JalaliDate.to_gregorian)
    values = df[var]
    plt.plot(gregorian_dates, values, marker='o', linestyle='-', color='b')

    plt.grid(True)
    plt.xlabel('Time')
    plt.ylabel(var)
    plt.title('Time Series Plot of '+ name_of_mehvar)

    if x_ticks:
        plt.xticks(x_ticks)
    plt.show()
def sketch_distrubution(df, name_of_mehvar, var):
    """Show one box plot of ``df[var]`` per month.

    Args:
        df: Frame with a ``jalali_start`` column whose values expose a
            ``month`` attribute, plus the column named by ``var``.
        name_of_mehvar: Text appended to the plot title.
        var: Name of the column whose distribution is drawn.
    """
    plt.figure(figsize=(8, 6))
    month_of_row = df['jalali_start'].apply(lambda x: x.month)
    sns.boxplot(x=month_of_row, y=df[var])
    plt.title('Box Plot of ' + name_of_mehvar)
    # The x-axis carries the month number, not `var`; label it accordingly.
    plt.xlabel('Month')
    plt.ylabel('Values of ' + var)
    plt.show()
#%%
# Gregorian date of the first day of each of the 12 Jalali months of 1395;
# passed to sketch_time_series as x-axis tick positions.
month_grids = [
    JalaliDate(1395, month_no, 1).to_gregorian()
    for month_no in range(1, 13)
]
# Code -> name of every daily sheet found in the archives matching 'تهران'.
mehvars = get_mehvars_of_ostan('تهران')
# Codes for which get_df_of_mehvar returned None.
missing = []
# Placeholder; rebound for every code processed in the loop further down.
df = pd.DataFrame()
#%%
# Intended tally of outlier dates; nothing fills it while the counting code
# at the end of count_outlier_of_days stays commented out.
not_ordinal_days = {}
def count_outlier_of_days(df):
    """Print the ``jalali_start`` of rows whose ``all_cars`` is a box-plot outlier.

    The whisker ends reported by ``boxplot_stats`` for ``df['all_cars']`` are
    the bounds: a row is an outlier when its value is strictly below the low
    whisker or strictly above the high whisker. Returns ``None``.
    """
    stats = boxplot_stats(df['all_cars'])[0]
    lower, upper = stats['whislo'], stats['whishi']
    cars = df['all_cars']
    too_low = cars < lower
    too_high = cars > upper
    print(df.loc[too_low | too_high, 'jalali_start'])
    # NOTE: per-date counting into not_ordinal_days, currently disabled:
    # for d in outliar_days:
    # print(d)
    # st = d.strftime('%Y/%m/%d')
    # if st in not_ordinal_days:
    # not_ordinal_days[st] += 1
    # else:
    # not_ordinal_days[st] = 1
#%%
# Plot every code collected in `mehvars` and print its monthly outlier dates.
for code, name in mehvars.items():
    # Codes up to 114552 are skipped (looks like a resume point from an
    # earlier run - confirm before relying on it).
    if int(code) <= 114552:
        continue
    df = get_df_of_mehvar(code, 'تهران')
    if df is None:
        # No sheet for this code in at least one archive; remember it.
        missing.append(code)
    else:
        df = clean_df(df)
        sketch_distrubution(df, name, 'all_cars')
        sketch_time_series(
            df, name, 'jalali_start', 'all_cars', x_ticks=month_grids
        )
        # Called for its printing side effect, once per month group.
        df.groupby('month').apply(count_outlier_of_days)
#%%
# Turn each raw count into a percentage of 89 (presumably the number of
# processed codes - confirm), then list the dates in ascending order.
for x in not_ordinal_days:
    raw_count = not_ordinal_days[x]
    not_ordinal_days[x] = (raw_count / 89) * 100
check = sorted(not_ordinal_days.items(), key=lambda item: item[1])
# Paste-site footer (not part of the program):
# Editor is loading...
# Leave a Comment