Unstructured text file read in Python1
unknown
python
2 years ago
2.8 kB
12
Indexable
# INPUT 1 (with # EXTRA ROWS tester3 tester4 / UNEVEN COLUMN): # TITLE_RAND = 2 var1: my text zero. var2: my text one. var3: my text two. var4: my text three. tester1 tester2 tester3 tester4 # TITLE_RAND = 3 var5: my text four. var6: my text 55. var7: my text 6. var8: my text 7. tester24 tester34 # INPUT 2: # TITLE_RAND = 2 var1: my text zero. var2: my text one. var3: my text two. var4: my text three. tester1 tester2 tester3 # extra row 1 tester4 # extra row 2 # TITLE_RAND = 3 var5: my text four. var6: my text 55. var7: my text 6. var8: my text 7. tester24 tester34 # TITLE_RAND = 4 # EXTRA COLUMN var5: my text four. var6: my text 55. var7: my text 6. var8: my text 7. testerAA testerAB testerAC testerAD testerAE testerAF testerAG testerAH # NO ERROR BUT UNEXPECTED OUTPUT # (tester4) https://i.imgur.com/MWhFbjp.png # (# EXTRA COLUMN # TITLE_RAND = 4) https://i.imgur.com/WFLXpCx.png # SCRIPT import re import csv from collections import Counter def convert_csv(filenm): "Produces structured data by converting to CSV file" # https://stackoverflow.com/questions/3348460/csv-file-written-with-python-has-blank-lines-between-each-row with open(filenm, "r") as fin, open("out1.txt", "w", newline="") as csvfile: csv_writer = csv.writer( csvfile, delimiter=" ", quotechar='"', quoting=csv.QUOTE_MINIMAL ) data = fin.readlines() # Regex attern for # followed by non-digits followed by = followed by number (integer or float) pattern = re.compile(r"# (\D+) = (\d+(?:\.\d+)?)") # Header cnts = Counter([m.group(1) for line in data if (m := pattern.match(line))]) frame_id = cnts.most_common(1)[0][ 0 ] # Most common meta data should be the frame identifier frame_name = frame_id.split("_")[ 0 ] # use the part before first underscore as frame name frames = [ f"{frame_name}_{m.group(2)}" for line in data if (m := pattern.match(line)) and m.group(1) == frame_id ] # non-blank Lines that don't start with '#' frame_vals = [ l for line in data if ((l := line.rstrip()) and not l.startswith("#")) ] # Dividing frame data into chunks by the number of frames by column chunksize = len(frame_vals) // len(frames) frame_data = list( zip( *[ frame_vals[i : i + chunksize] for i in range(0, len(frame_vals), chunksize) ] ) ) csv_writer.writerow(frames) # Write header # write other data for row in frame_data: csv_writer.writerow(row) convert_csv("teste.txt")
Editor is loading...