Unstructured text file read in Python1

 avatar
unknown
python
2 years ago
2.8 kB
12
Indexable
# INPUT 1 (with # EXTRA ROWS tester3 tester4 / UNEVEN COLUMN):


# TITLE_RAND = 2
var1: my text zero.
var2: my text one.
var3: my text two.
var4: my text three.
tester1
tester2
tester3
tester4
# TITLE_RAND = 3
var5: my text four.
var6: my text 55.
var7: my text 6.
var8: my text 7.
tester24
tester34




# INPUT 2:


# TITLE_RAND = 2
var1: my text zero.
var2: my text one.
var3: my text two.
var4: my text three.
tester1
tester2
tester3                 # extra row 1
tester4                 # extra row 2
# TITLE_RAND = 3
var5: my text four.
var6: my text 55.
var7: my text 6.
var8: my text 7.
tester24
tester34
# TITLE_RAND = 4        # EXTRA COLUMN
var5: my text four.
var6: my text 55.
var7: my text 6.
var8: my text 7.
testerAA
testerAB
testerAC
testerAD
testerAE
testerAF
testerAG
testerAH



# NO ERROR BUT UNEXPECTED OUTPUT


# (tester4)                             https://i.imgur.com/MWhFbjp.png

# (# EXTRA COLUMN # TITLE_RAND = 4)     https://i.imgur.com/WFLXpCx.png

# SCRIPT

import csv
import re
from collections import Counter
from itertools import zip_longest


def convert_csv(filenm, outfile="out1.txt"):
    """Convert a frame-structured text file into a space-delimited CSV file.

    The input consists of sections, each introduced by a header line of the
    form ``# NAME = NUMBER`` and followed by that section's values, one per
    line. Every section becomes one column of the output: the first row holds
    the column titles and the following rows hold the values, side by side.

    Sections may have different numbers of values. Shorter columns are padded
    with empty fields so that every value stays under the header of the
    section it was read from.

    Args:
        filenm: Path of the text file to read.
        outfile: Path of the CSV file to write (defaults to ``"out1.txt"``).

    Raises:
        ValueError: If the input contains no ``# NAME = NUMBER`` header line.
    """
    # Read the whole input before touching the output file, so a parsing
    # failure does not leave a truncated output file behind.
    with open(filenm, "r") as fin:
        data = fin.readlines()

    frames, columns = _parse_frames(data)

    # newline="" prevents the csv module from emitting blank lines between
    # rows on platforms that translate line endings:
    # https://stackoverflow.com/questions/3348460/csv-file-written-with-python-has-blank-lines-between-each-row
    with open(outfile, "w", newline="") as csvfile:
        csv_writer = csv.writer(
            csvfile, delimiter=" ", quotechar='"', quoting=csv.QUOTE_MINIMAL
        )
        csv_writer.writerow(frames)  # Write header
        # Transpose the columns into rows; zip_longest pads uneven columns
        # instead of truncating to the shortest one.
        csv_writer.writerows(zip_longest(*columns, fillvalue=""))


def _parse_frames(lines):
    """Split raw input lines into column titles and their per-column values.

    Returns a ``(titles, columns)`` pair where ``columns[i]`` is the list of
    values that followed the header which produced ``titles[i]``.
    """
    # Regex pattern for '# ' followed by non-digits, ' = ', then a number
    # (integer or float).
    pattern = re.compile(r"# (\D+) = (\d+(?:\.\d+)?)")
    matches = [pattern.match(line) for line in lines]

    counts = Counter(m.group(1) for m in matches if m)
    if not counts:
        raise ValueError("no '# NAME = NUMBER' header line found in input")

    # The most common metadata key is taken to be the frame identifier ...
    frame_id = counts.most_common(1)[0][0]
    # ... and the part before its first underscore is used as the title prefix.
    frame_name = frame_id.split("_")[0]

    titles = []
    columns = []
    for line, m in zip(lines, matches):
        if m and m.group(1) == frame_id:
            # A frame header opens a new column.
            titles.append(f"{frame_name}_{m.group(2)}")
            columns.append([])
            continue
        value = line.rstrip()
        # Keep non-blank lines that are not comments. Values appearing before
        # the first frame header belong to no column and are skipped.
        if value and not value.startswith("#") and columns:
            columns[-1].append(value)
    return titles, columns

if __name__ == "__main__":
    # Run the conversion only when executed as a script, so importing this
    # module does not read or write any files as a side effect.
    convert_csv("teste.txt")

Editor is loading...