scenes_columns_ints_romans_titles
unknown
python
2 years ago
3.7 kB
6
Indexable
"""Split a play transcript CSV into scene and player columns.

Reads ``datafolder/data.csv`` (columns ``Index`` and ``Line``), derives the
current scene title, the scene's Roman numeral, that numeral as an integer and
the current speaker for every row, and writes the result to ``file.csv``.
"""

# importing csv for writing to csv
import csv

# importing re for regular expressions
import re

# importing zip_longest for iterating over rows
from itertools import zip_longest

# importing pandas as pd
import pandas as pd

players = [
    "KING",
    "FERDINAND",
    "BEROWNE",
    "LONGAVILLE",
    "DUMAIN",
    "BOYET",
    "MARCADE",
    "DON ADRIANO DE ARMADO",
    "SIR NATHANIEL",
    "HOLOFERNES",
    "DULL",
    "COSTARD",
    "MOTH",
    "A FORESTER",
    "THE PRINCESS OF FRANCE",
    "ROSALINE",
    "MARIA",
    "KATHARINE",
    "JAQUENETTA",
    "LORDS",
    "ATTENDANTS",
]

### — SCENES TITLES STRINGS EXTRACT — ###
# Whole scene heading ("SCENE II."), at line start or after whitespace.
SCENE_TITLE_PATTERN = r"((\s|^)SCENE\s[A-Z]{1,7}\.)"

### — SCENES ROMAN TO INTEGER EXTRACT — ###
# Only the Roman numeral of a scene heading that ends the line.
SCENE_ROMAN_PATTERN = r"SCENE\s([A-Z]{1,7})\.$"

# Column order of the generated CSV.
OUTPUT_COLUMNS = [
    "Index",
    "SceneIntegers",
    "SceneRomans",
    "SceneTitles",
    "Player",
    "Line",
]

_ROMAN_VALUES = {"m": 1000, "d": 500, "c": 100, "l": 50, "x": 10, "v": 5, "i": 1}


### — PLAYER DATA — ###
def build_player_pattern(names):
    """Return a regex matching any of *names* followed by a period at line start.

    The single capture group holds the name including its trailing period
    (e.g. ``"KING."``). The period is appended once, outside the alternation,
    so that every name -- including the last one -- requires it. Names are
    escaped so they are always matched literally.
    """
    alternatives = "|".join(re.escape(name) for name in names)
    return rf"^((?:{alternatives})\.)"


### Convert Roman numerals from Scene Column into integers ###
def rn_to_int(s):
    """Convert the Roman numeral string *s* to an integer.

    Matching is case-insensitive and characters that are not Roman digits are
    ignored, so a string without any Roman digit (such as ``"0"`` or ``""``)
    yields 0.
    """
    values = [_ROMAN_VALUES[ch] for ch in s.lower() if ch in _ROMAN_VALUES]
    total = 0
    for pos, value in enumerate(values):
        # A digit smaller than its successor is subtracted (the "I" in "IV");
        # the last digit is compared with itself and therefore always added.
        following = values[min(pos + 1, len(values) - 1)]
        total += value if value >= following else -value
    return total


for numeral, expected in [["CLXIV", 164], ["MDCCLXXXIII", 1783], ["xiv", 14]]:
    assert rn_to_int(numeral) == expected


def build_scene_table(df):
    """Return a DataFrame with the ``OUTPUT_COLUMNS`` derived from *df*.

    *df* must provide the columns ``Index`` and ``Line``. Scene and player
    values are forward-filled so that every row carries the most recent scene
    and speaker; scene columns hold 0 for rows before the first scene heading.
    """
    lines = df["Line"]

    # extract groups having the strings in the players list with ending period
    players_col = lines.str.extract(pat=build_player_pattern(players)).ffill()
    # extract groups having the strings in the scenes list with ending period
    ti_scns = lines.str.extract(pat=SCENE_TITLE_PATTERN).ffill().fillna(0)
    # extract roman numerals in the scenes strings
    ro_scns = lines.str.extract(pat=SCENE_ROMAN_PATTERN).ffill().fillna(0)

    # The filler value 0 is not a string, so normalise each cell with str()
    # before handing it to rn_to_int().
    scene_ints = [rn_to_int(str(numeral)) for numeral in ro_scns[0]]

    # Every Series below shares df's index, so rows line up one-to-one.
    return pd.DataFrame(
        {
            "Index": df["Index"],
            "SceneIntegers": pd.Series(scene_ints, index=df.index),
            "SceneRomans": ro_scns[0],
            "SceneTitles": ti_scns[0],
            "Player": players_col[0],
            "Line": lines,
        }
    )


#### Create new csv output ####
def write_scene_table(table, path="file.csv"):
    """Write *table* to *path* as CSV with a header row of ``OUTPUT_COLUMNS``."""
    data = [table[name] for name in OUTPUT_COLUMNS]
    columns_data = zip_longest(*data)
    with open(path, "w", newline="") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(OUTPUT_COLUMNS)
        writer.writerows(columns_data)


def main():
    """Read ``datafolder/data.csv`` and write the derived table to ``file.csv``."""
    # Getting All Column and Rows Data
    df = pd.read_csv("datafolder/data.csv")
    write_scene_table(build_scene_table(df), "file.csv")


if __name__ == "__main__":
    main()
Editor is loading...