# Untitled

# Root directory of the traffic archives; a year directory and a month
# directory are appended to it when the archive tree is walked.
data_location = "E:/data_center/tracks/"
import os
import io
import rarfile
import re
# Persian names of the twelve Jalali calendar months, in calendar order.
# Each name is used as a directory name below the year directory.
months = ['فروردین', 'اردیبهشت', 'خرداد', 'تیر', 'مرداد', 'شهریور', 'مهر', 'آبان', 'آذر', 'دی', 'بهمن', 'اسفند']

import pandas as pd
import numpy as np 

from persiantools.jdatetime import JalaliDate, JalaliDateTime
import matplotlib.pyplot as plt
from matplotlib.cbook import boxplot_stats
import seaborn as sns
#%%
def extract_code_and_name(input_string):
    """Pull a six-digit code and the name that follows it out of a string.

    The name is the text enclosed between the Unicode directional marks
    U+202B and U+202C that comes after the code; at least one non-digit
    character must separate the code from the opening mark.

    Args:
        input_string: Text to search (the callers pass archive member names).

    Returns:
        A ``(code, name)`` tuple of strings, or ``(None, None)`` when the
        pattern does not occur in ``input_string``.
    """
    found = re.search(r'(\d{6})\D+\u202b(.+?)\u202c', input_string)
    if found is None:
        return None, None
    return found.group(1), found.group(2)


def get_mehvars_of_ostan(ostan='', year='1395'):
    """Collect the codes and names found in a province's daily archives.

    Walks every month directory under ``data_location/<year>/``, opens each
    ``.rar`` archive whose file name contains ``ostan`` and parses a code and
    a name out of the file name of every daily ('روزانه') member.

    Args:
        ostan: Substring an archive's file name must contain; the empty
            string matches every archive.
        year: Name of the year directory below ``data_location``.

    Returns:
        dict mapping each parsed code (str) to its name (str). Member names
        that do not contain a parsable code are skipped.

    Raises:
        AssertionError: If a matching archive has no daily member.
    """
    mehvars = {}
    year_dir = data_location + str(year) + '/'
    for month in months:
        for root, _dirs, files in os.walk(year_dir + month):
            print(root)
            for file in files:
                if ostan not in file or not file.endswith('.rar'):
                    continue

                # Join with `root`, not the month directory: os.walk also
                # yields archives that live in nested sub-directories.
                archive_path = os.path.join(root, file)

                # The context manager releases the archive handle even when
                # listing its members fails.
                with rarfile.RarFile(archive_path) as rf:
                    # Members can also be hourly ('ساعتی'); only daily ones
                    # are of interest here.
                    daily_names = [
                        member.filename
                        for member in rf.infolist()
                        if not member.is_dir() and 'روزانه' in member.filename
                    ]

                if not daily_names:
                    # Raised explicitly (instead of `assert`) so the check
                    # still runs under `python -O`.
                    raise AssertionError('no daily exist in loc: ' + archive_path)

                # A member name also carries other information, such as the
                # route; only the code and the name are extracted.
                for member_name in daily_names:
                    code, name = extract_code_and_name(member_name)
                    if code is None:
                        # Unparsable name: do not store a None key, which
                        # would break callers that convert codes to int.
                        continue
                    mehvars[code] = name
    return mehvars
#%%

def get_df_of_mehvar(code, ostan='', year='1395'):
    """Load and stack the daily spreadsheets of one code for a whole year.

    Walks every month directory under ``data_location/<year>/``. For each
    ``.rar`` archive whose file name contains ``ostan``, the first daily
    ('روزانه') member whose name contains ``code`` is read with
    ``pandas.read_excel``. All frames are concatenated in the order found.

    Args:
        code: Code to look for in member names; converted with ``str``.
        ostan: Substring an archive's file name must contain; the empty
            string matches every archive.
        year: Name of the year directory below ``data_location``.

    Returns:
        The concatenated DataFrame (empty when no archive matched), or
        ``None`` as soon as one matching archive has no member for ``code``.

    Raises:
        AssertionError: If a matching archive has no daily member.
    """
    code_text = str(code)
    year_dir = data_location + str(year) + '/'
    frames = []

    for month in months:
        for root, _dirs, files in os.walk(year_dir + month):
            print(root)
            for file in files:
                if ostan not in file or not file.endswith('.rar'):
                    continue

                # Join with `root`, not the month directory: os.walk also
                # yields archives that live in nested sub-directories.
                archive_path = os.path.join(root, file)

                # Context managers close the archive and the member stream on
                # every exit path, including the early `return None`.
                with rarfile.RarFile(archive_path) as rf:
                    # Members can also be hourly ('ساعتی'); only daily ones
                    # are of interest here.
                    daily = [
                        member
                        for member in rf.infolist()
                        if not member.is_dir() and 'روزانه' in member.filename
                    ]
                    if not daily:
                        # Raised explicitly (instead of `assert`) so the check
                        # still runs under `python -O`.
                        raise AssertionError('no daily exist in loc: ' + archive_path)

                    # A member name also carries other information, such as
                    # the route, so the code is matched as a substring.
                    matching = [member for member in daily if code_text in member.filename]
                    if not matching:
                        # `code_text` rather than `code`: concatenating a
                        # non-str code would raise TypeError.
                        print('cant find ' + code_text + ' in month ' + month)
                        return None

                    with rf.open(matching[0]) as member_stream:
                        workbook_bytes = member_stream.read()

                frames.append(pd.read_excel(io.BytesIO(workbook_bytes)))

    if not frames:
        return pd.DataFrame()
    # One concat at the end avoids re-copying the accumulated rows on every
    # archive and avoids concatenating with an empty seed frame.
    return pd.concat(frames, ignore_index=True)

def clean_df(df):
    """Return a cleaned copy of a raw daily frame; the input is not modified.

    Steps, in order:
      1. Rename the 16 columns to fixed English names.
      2. Coerce every column except ``name``, ``start`` and ``end`` to
         numeric; unparsable values become NaN.
      3. Drop rows whose ``code`` is missing.
      4. For rows with ``total_time`` below 1400, replace ``all_cars`` by
         ``total_pred`` when that is present, otherwise scale ``all_cars``
         by ``1440 / total_time``.
      5. Add ``jalali_start`` (a ``JalaliDate`` parsed from ``start``) and
         ``month`` (its month number).

    Args:
        df: Frame with exactly 16 columns in the order of the names below.
            Assigning the names fails in pandas if the count differs.

    Returns:
        A new DataFrame with the columns above plus ``jalali_start`` and
        ``month``.
    """
    # Work on a copy so the caller's frame keeps its columns and values.
    cleaned = df.copy()
    cleaned.columns = ['code', 'name', 'start', 'end', 'total_time', 'all_cars', 'cars_1', 'cars_2', 'cars_3', 'cars_4', 'cars_5',
                       'average_speed', 'speed_viol', 'dis_viol', 'over_viol', 'total_pred']
    numeric_cols = cleaned.columns.drop(['name', 'start', 'end'])
    cleaned[numeric_cols] = cleaned[numeric_cols].apply(pd.to_numeric, errors='coerce')

    # Drop rows that have no code. The explicit copy makes the `.loc`
    # assignments below write to an independent frame, not to a
    # slice-derived object (chained assignment).
    cleaned = cleaned.dropna(subset=['code']).copy()

    # Fix under-recorded days.
    # NOTE(review): total_time is presumably minutes of recording, with 1440
    # a full day and 1400 the cut-off for "incomplete" - confirm.
    short_day = cleaned['total_time'] < 1400
    has_pred = cleaned['total_pred'].notna()

    use_pred = short_day & has_pred
    cleaned.loc[use_pred, 'all_cars'] = cleaned.loc[use_pred, 'total_pred']

    # Extrapolate only where total_time is positive; dividing by zero would
    # put inf/NaN into all_cars. Rows with total_time <= 0 keep their value.
    extrapolate = short_day & ~has_pred & (cleaned['total_time'] > 0)
    cleaned.loc[extrapolate, 'all_cars'] = (
        cleaned.loc[extrapolate, 'all_cars'] * (1440 / cleaned.loc[extrapolate, 'total_time'])
    )

    # Replace JalaliDate with JalaliDateTime if the time field is wanted too.
    cleaned['jalali_start'] = cleaned['start'].apply(lambda x: JalaliDate.fromisoformat(x.replace('/', '-')))
    cleaned['month'] = cleaned['jalali_start'].apply(lambda x: x.month)
    return cleaned
#%%
def sketch_time_series(df, name_of_mehvar, time, var, x_ticks=None):
    """Show a line plot of column ``var`` over the dates in column ``time``.

    Args:
        df: Frame holding both columns.
        name_of_mehvar: Text appended to the plot title.
        time: Name of the column of ``JalaliDate`` values; each one is
            converted to a Gregorian date for the x axis.
        var: Name of the column plotted on the y axis; also the y label.
        x_ticks: Optional tick positions for the x axis; ignored when falsy.
    """
    line_style = {'marker': 'o', 'linestyle': '-', 'color': 'b'}

    plt.figure(figsize=(10, 6))
    gregorian_dates = df[time].apply(JalaliDate.to_gregorian)
    plt.plot(gregorian_dates, df[var], **line_style)

    plt.title('Time Series Plot of ' + name_of_mehvar)
    plt.xlabel('Time')
    plt.ylabel(var)
    plt.grid(True)
    if x_ticks:
        plt.xticks(x_ticks)
    plt.show()
def sketch_distrubution(df, name_of_mehvar, var):
    """Show one box per month with the distribution of column ``var``.

    Args:
        df: Frame with a ``jalali_start`` column of date objects exposing
            ``.month``, and the column named by ``var``.
        name_of_mehvar: Text appended to the plot title.
        var: Name of the column whose values are drawn on the y axis.
    """
    plt.figure(figsize=(8, 6))
    month_of_row = df['jalali_start'].apply(lambda x: x.month)
    sns.boxplot(x=month_of_row, y=df[var])
    plt.title('Box Plot of ' + name_of_mehvar)
    # The x axis groups the rows by month. It used to be labelled with
    # `var`, which describes the y axis instead.
    plt.xlabel('Month')
    plt.ylabel('Values of ' + var)
    plt.show()
#%%
# Gregorian date of the first day of each of the twelve months of Jalali
# year 1395; passed as x-axis tick positions to the time-series plot.
month_grids = [JalaliDate(1395, month_no, 1).to_gregorian() for month_no in range(1, 13)]
# code -> name mapping read from the archives matching 'تهران'.
mehvars = get_mehvars_of_ostan('تهران')
# `missing` collects the codes for which no data frame could be built.
missing, df = [], pd.DataFrame()
#%%
not_ordinal_days = {}


def count_outlier_of_days(df):
    """Print the days whose ``all_cars`` lies outside the box-plot whiskers.

    The whisker limits come from ``matplotlib.cbook.boxplot_stats`` applied
    to the non-missing ``all_cars`` values of ``df``.

    Args:
        df: Frame with ``all_cars`` and ``jalali_start`` columns (the main
            loop passes one month's rows at a time via ``groupby``).

    Returns:
        None. The ``jalali_start`` values of the outlier rows are printed.
    """
    # Missing values must be removed first: with a NaN in the input the
    # percentiles, and therefore both whiskers, become NaN and no row can
    # ever compare as an outlier.
    observed = df['all_cars'].dropna()
    box = boxplot_stats(observed)[0]
    whisker_low = box['whislo']
    whisker_high = box['whishi']

    is_outlier = (df['all_cars'] < whisker_low) | (df['all_cars'] > whisker_high)
    outliar_days = df.loc[is_outlier, 'jalali_start']
    print(outliar_days)
    # NOTE(review): the per-day tally into `not_ordinal_days` (keyed by
    # strftime('%Y/%m/%d') of each outlier day) was disabled in the original
    # and is still not performed here, so `not_ordinal_days` stays empty.

#%%
# Build, clean and plot the yearly data of every code above 114552.
# NOTE(review): the 114552 cut-off presumably resumes an earlier,
# interrupted run - confirm before reusing.
for code, name in mehvars.items():
    if int(code) > 114552:
        df = get_df_of_mehvar(code, 'تهران')
        if df is None:
            # At least one archive had no data for this code.
            missing.append(code)
        else:
            df = clean_df(df)
            sketch_distrubution(df, name, 'all_cars')
            sketch_time_series(df, name, 'jalali_start', 'all_cars', x_ticks=month_grids)
            # Report the outlier days of each month separately.
            df.groupby('month').apply(count_outlier_of_days)

#%%
# Express every tally as a percentage of 89.
# NOTE(review): 89 is presumably the number of codes processed - confirm.
for x in not_ordinal_days:
    not_ordinal_days[x] = 100 * (not_ordinal_days[x] / 89)

# (day, percentage) pairs ordered from the smallest percentage to the largest.
check = sorted(not_ordinal_days.items(), key=lambda entry: entry[1])
# Editor is loading...