Assignment09.py

# CSCI 355 Web Technology
#Summer 2024
#Assignment 9:   “Data Compression”



#ACK - Class



import heapq

# [1] Define functions that implement each of the four data compression algorithms:
#
# dc_run_length_encoding()
# dc_huffman_coding()
# dc_lempel_ziv_welch()
# dc_arithmetic_encoding()
def dc_run_length_encoding(bits):
    print("DC Run Length Encoding")
    print(f"Bits: {bits}")
    n = len(bits)
    count = 0
    res = ""
    for i in range(n):
        if bits[i] == "0":
            count += 1
        else:
            res += f"{count} "
            count = 0
    if count > 0:
        res += f"{count}"
    print("Result: ", res)
    return res

class node:
    def __init__(self, freq, symbol, left=None, right=None):
        # frequency of symbol
        self.freq = freq

        # symbol name (character)
        self.symbol = symbol

        # node left of current node
        self.left = left

        # node right of current node
        self.right = right

        # tree direction (0/1)
        self.huff = ''

    def __lt__(self, nxt):
        return self.freq < nxt.freq

    # utility function to print huffman


# codes for all symbols in the newly
# created Huffman tree
def printNodes(dict_huffman, node, val='',):
    # huffman code for current node
    newVal = val + str(node.huff)

    # if node is not an edge node
    # then traverse inside it
    if (node.left):
        printNodes(dict_huffman, node.left, newVal)
    if (node.right):
        printNodes(dict_huffman, node.right, newVal)

        # if node is edge node then
        # display its huffman code
    if (not node.left and not node.right):
        dict_huffman[node.symbol] = newVal
        print(f"{node.symbol} -> {newVal}")

    # characters for huffman tree


def dc_huffman_coding(message, alphabet, frequency):

    # list containing unused nodes
    nodes = []

    # converting characters and frequencies
    # into huffman tree nodes
    for i in range(len(alphabet)):
        heapq.heappush(nodes, node(frequency[i], alphabet[i]))

    while len(nodes) > 1:
        # sort all the nodes in ascending order
        # based on their frequency
        left = heapq.heappop(nodes)
        right = heapq.heappop(nodes)

        # assign directional value to these nodes
        left.huff = 0
        right.huff = 1

        # combine the 2 smallest nodes to create
        # new node as their parent
        newNode = node(left.freq + right.freq, left.symbol + right.symbol, left, right)

        heapq.heappush(nodes, newNode)

    # Huffman Tree is ready!
    huffman_dict = {}
    printNodes(huffman_dict, nodes[0])

    print(f"Huffman Dictionary: {huffman_dict}")

    encoded_message ="".join([huffman_dict[letter] for letter in message])

    print(f"Encoded Message: {encoded_message}")

    len_uncompressed = len(message)
    len_compressed = len(encoded_message)
    print(f"Uncompressed Length: {len_uncompressed}")
    print(f"Compressed Length: {len_compressed}")
    compression_rate = round(len_compressed / len_uncompressed, 2)
    print(f"Compression Rate: {compression_rate}")

def dc_lempel_ziv_welch(message, alphabet):
    """Compress a string to a list of output symbols."""

    # Build the dictionary.
    dict_size = len(alphabet)
    dictionary = {alphabet[i]: i for i in range(dict_size)}

    w = ""
    result = []
    for c in message:
        wc = w + c
        if wc in dictionary:
            w = wc
        else:
            result.append(dictionary[w])
            # Add wc to the dictionary.
            dictionary[wc] = dict_size
            dict_size += 1
            w = c

    # Output the code for w.
    if w:
        result.append(dictionary[w])
    return result

def dc_arithmetic_encoding(message, chars, freqs):
    # see figure 8.7 on slide 8.2
    terminating = chars[-1]
    cum_freqs = compute_cum_freqs(freqs)

    encode(message, chars, freqs, cum_freqs)

def compute_cum_freqs(freqs):
    cum_freqs = [0]
    cum_freq = 0
    for i, freq in enumerate(freqs):
        if i == len(freqs) - 1:
            break
        cum_freq += freq
        cum_freqs.append(cum_freq)
    print(cum_freqs)
    return cum_freqs

def encode(message, chars, freqs, cum_freqs):
    interval0 = 0
    interval1 = 1
    for char in message:
        idx = chars.index(char)
        freq = freqs[idx]
        cum_freq = cum_freqs[idx]
        interval_size = interval1 - interval0
        interval0 += interval_size * cum_freq
        interval1 = interval0 + freq * interval_size
    return interval_to_binary(interval0, interval1)

def interval_to_binary(interval_start, interval_finish):
    bin_str = ""
    prob = 0
    index = 0
    while prob < interval_start:
        index += 1
        power = 1 / (2 ** index)
        if prob + power <= interval_finish:
            bin_str += "1"
            prob += power
        else:
            bin_str += "0"
    print("prob", prob)
    print("bin_string", bin_str)
    return bin_str


def main():
    print("RLE")
    bits = "0" * 12 + "1" + "0" * 3 + "1" + "0" * 0 + "1" + "0" * 8
    print("Bits: ", bits)
    dc_run_length_encoding(bits)
    message = "BAABABBBAABBBBAA"
    alphabet = ['A', 'B']
    LZW_compressed = dc_lempel_ziv_welch(message, alphabet)
    print("LZW")
    print(f"Message: {message}")
    print("LZW Compressed: ", LZW_compressed)

    print()
    print("Huffman")
    huffman_message = "EAEBAECDDEA"
    huffman_alphabet = ['A', 'B', 'C', 'D', 'E']
    huffman_frequencies = [20, 10, 10, 30 , 30]

    print(f"Huffman Message: {huffman_message}")
    print(f"Huffman Alphabet: {huffman_alphabet}")
    print(f"Huffman Frequencies: {huffman_frequencies}")

    dc_huffman_coding(huffman_message, huffman_alphabet, huffman_frequencies)

    message = "BBAB*"
    chars = ["A", "B", "*"]
    freqs = [.4, .5, .1]

    print()
    print("Arithmetic Encoding")
    print(f"Message: {message}")
    print(f"Chars: {chars}")
    print(f"Freqs: {freqs}")

    dc_arithmetic_encoding(message, chars, freqs)

if __name__ == "__main__":
    main()
Editor is loading...