scenes_columns_ints_romans_titles

 avatar
unknown
python
2 years ago
3.7 kB
6
Indexable
# importing pandas as pd
import pandas as pd

# importing re for regular expressions
import re

# importing csv for writing to csv
import csv

# importing zip_longest for iterating over rows
from itertools import zip_longest


# Load the whole input table (every column and row) from disk.
df = pd.read_csv("datafolder/data.csv")
# Keep separate handles on the "Line" and "Index" columns.
df2, idx = df["Line"], df["Index"]
# Re-key the frame's rows by the values of its own "Index" column.
df.index = idx


# Names looked for at the very start of a line, each followed by a period.
players = [
    "KING",
    "FERDINAND",
    "BEROWNE",
    "LONGAVILLE",
    "DUMAIN",
    "BOYET",
    "MARCADE",
    "DON ADRIANO DE ARMADO",
    "SIR NATHANIEL",
    "HOLOFERNES",
    "DULL",
    "COSTARD",
    "MOTH",
    "A FORESTER",
    "THE PRINCESS OF FRANCE",
    "ROSALINE",
    "MARIA",
    "KATHARINE",
    "JAQUENETTA",
    "LORDS",
    "ATTENDANTS",
]

### — PLAYER DATA — ###
# Build one alternation "^(NAME\.|NAME\.|...)" in which EVERY name carries
# the trailing period. Appending the period per name (rather than using it
# as the join separator) ensures the last name in the list also requires
# it. Names are regex-escaped so they are always matched literally.
player_pattern = "^(" + "|".join(re.escape(name) + r"\." for name in players) + ")"

# Column 0 holds the matched "NAME." for lines that start with one; the
# forward fill carries that value down to the following lines until the next
# match. Lines before the first match stay NaN.
players_col = df["Line"].str.extract(pat=player_pattern).ffill()


### — SCENES TITLES STRINGS EXTRACT — ###
# Capture the scene heading text: "SCENE", one whitespace character, one to
# seven capital letters and a period, preceded by whitespace or the start of
# the line. Column 0 is the whole heading (including any leading whitespace
# character); column 1 is the inner (\s|^) group. Headings are forward-filled
# onto the following lines, and rows before the first heading become 0.
# Raw strings keep \s and \. from being read as (invalid) string escapes.
ti_scns = df["Line"].str.extract(pat=r"((\s|^)SCENE\s[A-Z]{1,7}\.)").ffill().fillna(0)

### — SCENES ROMAN NUMERAL EXTRACT — ###
# Capture only the capital letters between "SCENE " and the period that ends
# the line (column 0), forward-filled the same way; rows before the first
# heading become 0.
ro_scns = df["Line"].str.extract(pat=r"SCENE\s([A-Z]{1,7})\.$").ffill().fillna(0)


### Convert Roman numerals from Scene Column into integers ###
def rn_to_int(s):
    """Return the integer value of the Roman numeral string ``s``.

    Matching is case-insensitive. Characters that are not Roman numeral
    symbols are skipped, so a string with no symbols at all (including the
    empty string) yields 0.
    """
    symbol_values = {"i": 1, "v": 5, "x": 10, "l": 50, "c": 100, "d": 500, "m": 1000}
    digits = [symbol_values[ch] for ch in s.lower() if ch in symbol_values]

    total = 0
    # Pair every digit with its right-hand neighbour; the final digit is
    # paired with itself so that it is always added.
    for digit, neighbour in zip(digits, digits[1:] + digits[-1:]):
        if digit < neighbour:
            # A smaller symbol directly before a larger one is subtractive.
            total -= digit
        else:
            total += digit
    return total


# Self-check: the converter must reproduce these known numeral/value pairs,
# including a lowercase numeral.
for numeral, expected in {"CLXIV": 164, "MDCCLXXXIII": 1783, "xiv": 14}.items():
    assert expected == rn_to_int(numeral)


### Convert the 1st column of ro_scns to integers ###
# Walk the column's values directly (no index needed) and convert each one.
# Every cell goes through str() first because the gaps in ro_scns were filled
# with the integer 0, which rn_to_int() then maps to 0.
col_ri = [rn_to_int(str(cell)) for cell in ro_scns.iloc[:, 0]]

#### Wrap every output column in its own single-column DataFrame ####
# Values of the "Index" column
df_Idx = df["Index"].to_frame()
# Scene numbers as integers, keyed by the "Index" column values
df_ri_scns = pd.DataFrame(col_ri, index=idx)
# Scene numbers as Roman numeral strings
df_ro_scns = ro_scns[0].to_frame()
# Scene heading strings
df_ti_scns = ti_scns[0].to_frame()
# Player matched at the start of a line, forward-filled
df_plrs = players_col[0].to_frame()
# The text lines themselves
df_lnes = df["Line"].to_frame()


#### Start from an empty DataFrame and attach the columns in output order ####
df_cols = pd.DataFrame()
for column_name, frame in (
    ("Index", df_Idx),
    ("SceneIntegers", df_ri_scns),
    ("SceneRomans", df_ro_scns),
    ("SceneTitles", df_ti_scns),
    ("Player", df_plrs),
    ("Line", df_lnes),
):
    df_cols[column_name] = frame

### Select the output columns, in the order they are written ###
# A single list drives both the header row and the column selection so the
# two cannot drift apart.
output_columns = ["Index", "SceneIntegers", "SceneRomans", "SceneTitles", "Player", "Line"]
data = [df_cols[name] for name in output_columns]
# Transpose the column-wise data into row tuples for the csv writer.
columns_data = zip_longest(*data)

#### Create new csv output ####
# The encoding is given explicitly so the output does not depend on the
# platform's locale (pandas reads the input as UTF-8 by default).
# newline="" lets the csv module control line endings itself.
with open("file.csv", "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(output_columns)
    writer.writerows(columns_data)
Editor is loading...