#!/usr/bin/env python3; ############################################################################### # This program implements Huffman Coding to compress/decompress a file. # # Copyright © 2021 Richard Lesh. All rights reserved. ############################################################################### import Utils import sys # Store 16-bits of the int def put_int(i, ofh) : ofh.write(Utils.bitwiseAnd32((Utils.arshift32(i, 8)), 0xff).to_bytes(1, byteorder='big', signed=False)) ofh.write(Utils.bitwiseAnd32(i, 0xff).to_bytes(1, byteorder='big', signed=False)) # Read 16-bits of an int # -1 becomes 65535 def get_int(ifh) : b = Utils.getbyte(ifh) i = b i = Utils.lshift32(i, 8) b = Utils.getbyte(ifh) i = Utils.bitwiseOr32(i, b) return i # POD (plain old data) type to represent Huffman tree node. # value of -1 is an internal node, otherwise leaf node. class Node : def __init__(self, v, f) : self.value = v self.frequency = f self.left = None self.right = None def write(self, ofh) : _.put_int(self.value, ofh) FREQUENCIES = {} TREE = [] HUFFMAN_CODES = {} def bubble_sort_reverse(list) : for i in range(len(list) - 2, 0 + -1, -1) : has_swap = False for j in range(0, i + 1) : if (list[j].frequency < list[j + 1].frequency or list[j].frequency == list[j + 1].frequency and list[j].value < list[j + 1].value) : temp = list[j + 1] list[j + 1] = list[j] list[j] = temp has_swap = True if (not has_swap) : break def write_tree(node, ofh) : node.write(ofh) if (node.value == -1) : write_tree(node.left, ofh) write_tree(node.right, ofh) def read_tree(ifh) : node = Node(get_int(ifh), 0) if (node.value == 65535) : node.left = read_tree(ifh) node.right = read_tree(ifh) return node partial_byte = 0 partial_bits = 0 def put_bit(b, ofh) : global partial_byte global partial_bits partial_byte = Utils.lshift32(partial_byte, 1) if (b) : partial_byte = Utils.bitwiseOr32(partial_byte, 1) partial_bits += 1 if ((partial_bits == 8)) : ofh.write(partial_byte.to_bytes(1, byteorder='big', signed=False)) partial_byte = 0 partial_bits = 0 def put_code(c, ofh) : global HUFFMAN_CODES code = HUFFMAN_CODES[c] mask = Utils.lshift32(1, (code[1] - 1)) for i in range(0, code[1]) : b = (Utils.bitwiseAnd32(code[0], mask)) != 0 put_bit(b, ofh) mask = Utils.arshift32(mask, 1) def lookup_code(c, ofh) : global partial_byte global partial_bits global TREE partial_byte = Utils.lshift32(partial_byte, 8) partial_byte = Utils.bitwiseOr32(partial_byte, c) partial_bits += 8 while (partial_bits > 0) : mask = Utils.lshift32(1, (partial_bits - 1)) node = TREE[0] output_path = 0 bits_used = 0 while (node.value == 65535) : if (mask == 0) : return output_path = Utils.lshift32(output_path, 1) if ((Utils.bitwiseAnd32(partial_byte, mask)) == 0) : node = node.left else : node = node.right output_path = Utils.bitwiseOr32(output_path, 1) bits_used += 1 mask = Utils.arshift32(mask, 1) ofh.write(node.value.to_bytes(1, byteorder='big', signed=False)) partial_bits -= bits_used partial_byte = Utils.bitwiseAnd32(partial_byte, Utils.complement32((Utils.lshift32(output_path, partial_bits)))) def construct_huffman_codes(node, code) : global HUFFMAN_CODES if (node.value == -1) : new_code = (Utils.lshift32(code[0], 1), code[1] + 1) construct_huffman_codes(node.left, new_code) new_code = (Utils.bitwiseOr32((Utils.lshift32(code[0], 1)), 1), code[1] + 1) construct_huffman_codes(node.right, new_code) else : HUFFMAN_CODES[node.value] = code def build_tree() : global FREQUENCIES global TREE for k in FREQUENCIES.keys() : TREE.append(Node(k, FREQUENCIES[k])) tree_size = len(TREE) while (tree_size > 1) : bubble_sort_reverse(TREE) new_node = Node(-1, 0) new_node.right = TREE.pop() new_node.left = TREE.pop() new_node.frequency = new_node.left.frequency + new_node.right.frequency TREE.append(new_node) tree_size = len(TREE) def compute_frequencies(from_filespec) : global FREQUENCIES ifh = open(from_filespec, "rb") c = Utils.getbyte(ifh) while c != None : if (not (c in FREQUENCIES.keys())) : FREQUENCIES[c] = 0 FREQUENCIES[c] += 1 c = Utils.getbyte(ifh) ifh.close() def compress_file(from_filespec, to_filespec) : global TREE ofh = open(to_filespec, "wb") write_tree(TREE[0], ofh) ifh = open(from_filespec, "rb") c = Utils.getbyte(ifh) while c != None : put_code(c, ofh) c = Utils.getbyte(ifh) for i in range(0, 7) : put_bit(False, ofh) ofh.close() ifh.close() def decompress_file(from_filespec, to_filespec) : global TREE ifh = open(from_filespec, "rb") ofh = open(to_filespec, "wb") TREE = [read_tree(ifh)] c = Utils.getbyte(ifh) while c != None : lookup_code(c, ofh) c = Utils.getbyte(ifh) ofh.close() ifh.close() # Begin Main if (len(sys.argv) != 4) : print("Syntax: " + sys.argv[0] + " {fromFilespec} {toFilespec} {C|D}") sys.exit(1) from_filespec = sys.argv[1] to_filespec = sys.argv[2] mode = sys.argv[3] try : if (mode == "C") : compute_frequencies(from_filespec) build_tree() start_code = (0, 0) construct_huffman_codes(TREE[0], start_code) compress_file(from_filespec, to_filespec) elif (mode == "D") : decompress_file(from_filespec, to_filespec) else : print("Bad mode. Only C|D allowed.") except OSError as ex : print("Error: " + str(ex))