Unstructured text file read in Python1
unknown
python
3 years ago
2.8 kB
20
Indexable
# INPUT 1 (with # EXTRA ROWS tester3 tester4 / UNEVEN COLUMN):
# TITLE_RAND = 2
var1: my text zero.
var2: my text one.
var3: my text two.
var4: my text three.
tester1
tester2
tester3
tester4
# TITLE_RAND = 3
var5: my text four.
var6: my text 55.
var7: my text 6.
var8: my text 7.
tester24
tester34
# INPUT 2:
# TITLE_RAND = 2
var1: my text zero.
var2: my text one.
var3: my text two.
var4: my text three.
tester1
tester2
tester3 # extra row 1
tester4 # extra row 2
# TITLE_RAND = 3
var5: my text four.
var6: my text 55.
var7: my text 6.
var8: my text 7.
tester24
tester34
# TITLE_RAND = 4 # EXTRA COLUMN
var5: my text four.
var6: my text 55.
var7: my text 6.
var8: my text 7.
testerAA
testerAB
testerAC
testerAD
testerAE
testerAF
testerAG
testerAH
# NO ERROR BUT UNEXPECTED OUTPUT
# (tester4) https://i.imgur.com/MWhFbjp.png
# (# EXTRA COLUMN # TITLE_RAND = 4) https://i.imgur.com/WFLXpCx.png
# SCRIPT
import csv
import re
from collections import Counter
from itertools import zip_longest
def convert_csv(filenm: str, out_filenm: str = "out1.txt") -> None:
    """Convert a frame-structured text file into a space-delimited table.

    The input is scanned for metadata lines of the form ``# NAME = NUMBER``
    (NUMBER being an integer or a float).  The most frequent ``NAME`` is
    taken as the frame identifier, and every line carrying it opens a new
    frame, i.e. one output column, titled
    ``<part of NAME before the first underscore>_<NUMBER>``.

    Every non-blank line that does not start with ``#`` is stored as a value
    of the frame opened most recently; such lines appearing before the first
    frame header belong to no frame and are ignored.

    Frames may hold different numbers of values.  Shorter columns are padded
    with empty fields so that each value stays under its own frame header
    instead of being shifted into a neighbouring column or dropped.

    Args:
        filenm: Path of the text file to read.
        out_filenm: Path of the file to write.  Defaults to ``"out1.txt"``.

    Raises:
        ValueError: If the input contains no ``# NAME = NUMBER`` line.
    """
    # '# ' + non-digits + ' = ' + number (integer or float).  ``match`` only
    # anchors at the line start, so trailing text after the number is allowed.
    pattern = re.compile(r"# (\D+) = (\d+(?:\.\d+)?)")

    with open(filenm, "r") as fin:
        data = fin.readlines()

    # The most common metadata key is assumed to be the frame identifier.
    cnts = Counter(
        found.group(1) for line in data if (found := pattern.match(line))
    )
    if not cnts:
        # Fail before the output file is opened so nothing gets truncated.
        raise ValueError(
            f"no '# NAME = NUMBER' frame header found in {filenm!r}"
        )
    frame_id = cnts.most_common(1)[0][0]
    # Use the part before the first underscore as the frame name.
    frame_name = frame_id.split("_")[0]

    frames = []   # column titles, in order of appearance
    columns = []  # columns[i] holds the values listed under frames[i]
    for line in data:
        header = pattern.match(line)
        if header and header.group(1) == frame_id:
            frames.append(f"{frame_name}_{header.group(2)}")
            columns.append([])
            continue
        value = line.rstrip()
        if not value or value.startswith("#"):
            continue  # blank line or comment / other metadata
        if columns:
            columns[-1].append(value)

    # newline="" prevents blank lines between rows on some platforms:
    # https://stackoverflow.com/questions/3348460/csv-file-written-with-python-has-blank-lines-between-each-row
    with open(out_filenm, "w", newline="") as csvfile:
        csv_writer = csv.writer(
            csvfile, delimiter=" ", quotechar='"', quoting=csv.QUOTE_MINIMAL
        )
        csv_writer.writerow(frames)  # header
        # Transpose columns into rows, padding uneven columns with "".
        csv_writer.writerows(zip_longest(*columns, fillvalue=""))
if __name__ == "__main__":
    # Run the conversion only when this file is executed as a script, so that
    # importing the module does not read or write any files.
    convert_csv("teste.txt")
Editor is loading...