I've implemented a minimal external sort for text files using Python's heapq module.
On the few tests I ran it seems to work well, but I would like some advice on how to make the code cleaner and faster. I do not know much about good practices and I want to learn (I may wish to move from academia to industry one day). All remarks, advice and suggestions are warmly welcome.
There are three functions: one that splits the big file into smaller files, one that does the merge, and one main function.
import os
import tempfile
import heapq
import sys
import shutil


# Algorithm based on
# https://github.com/melvilgit/external-Merge-Sort


def _column_key(col):
    """
    Build the sort key used by both the split and the merge phase.

    Sharing one key function is what guarantees that the chunks are sorted in
    the same order the merge expects.

    :param col: index of the tab-separated column to sort on
    :return: a function mapping a raw line (bytes) to the bytes of column
        ``col``; fields are compared as raw bytes, and for the last column of
        a line the field still carries the line terminator
    """
    def key(line):
        # Raises IndexError when the line has fewer than ``col + 1`` columns.
        return line.split(b"\t")[col]
    return key


def _write_sorted_chunk(lines, my_temp_dir, key):
    """
    Sort ``lines`` in place and store them in a new temporary file.

    :param lines: list of raw lines (bytes), each ending with a newline
    :param my_temp_dir: directory in which the temporary file is created
    :param key: key function used for sorting
    :return: the temporary file, still open and rewound to its start
    """
    lines.sort(key=key)
    # delete=False: the file must survive until the merge phase removes it.
    temp_file = tempfile.NamedTemporaryFile(dir=my_temp_dir, delete=False)
    temp_file.writelines(lines)
    temp_file.seek(0)
    return temp_file


def split_large_file(starting_file, my_temp_dir, max_line=1000000, col=0):
    """
    Split the input file into sorted chunks of at most ``max_line`` lines.

    :param starting_file: input file to be split
    :param my_temp_dir: temporary directory receiving the chunk files
    :param max_line: number of lines to put in each smaller file (ram usage)
    :param col: tab-separated column to sort each chunk on; must be the same
        column later passed to ``merged``
    :return: a list with all temporary files, opened and positioned at their
        first line, in the order the chunks appear in the input
    :raises ValueError: if ``max_line`` is smaller than 1
    :raises IndexError: if a line has fewer than ``col + 1`` columns
    """
    if max_line < 1:
        raise ValueError("max_line must be a positive integer")
    key = _column_key(col)
    liste_file = []
    line_holder = []
    try:
        with open(starting_file, 'rb') as f_in:
            for line in f_in:
                # Only the final line of a file can lack a terminator; add it
                # so the record is not glued to the next one when merging.
                if not line.endswith(b"\n"):
                    line += b"\n"
                line_holder.append(line)
                if len(line_holder) >= max_line:
                    liste_file.append(
                        _write_sorted_chunk(line_holder, my_temp_dir, key))
                    line_holder = []
        if line_holder:
            liste_file.append(
                _write_sorted_chunk(line_holder, my_temp_dir, key))
    except BaseException:
        # Do not leak the handles of the chunks created before the failure.
        for temp_file in liste_file:
            temp_file.close()
        raise
    return liste_file


def merged(liste_file, out_file, col):
    """
    Merge already-sorted chunk files into one sorted output file.

    Every chunk file is closed and removed from disk before returning, whether
    the merge succeeds or not.

    :param liste_file: a list with all temporary files opened, each one sorted
        on column ``col`` (as produced by ``split_large_file``)
    :param out_file: the output file
    :param col: the column where to perform the sort, being minimal the script
        will fail if one line has fewer columns than this value requires
    :return: path to output file
    :raises IndexError: if a line has fewer than ``col + 1`` columns
    """
    key = _column_key(col)
    try:
        with open(out_file, "wb") as out:
            # heapq.merge streams the chunks lazily, keeps one line per chunk
            # in memory, and resolves equal keys by chunk order, so file
            # objects are never compared with each other.
            out.writelines(heapq.merge(*liste_file, key=key))
    finally:
        for temp_file in liste_file:
            temp_file.close()
            os.remove(temp_file.name)
    return out_file


def main(big_file, outfile, tmp_dir=None, max_line=1000000, column=0):
    """
    Sort a large tab-separated file on one column using an external sort.

    :param big_file: path of the file to sort
    :param outfile: path of the sorted file to create
    :param tmp_dir: directory in which the scratch directory is created;
        defaults to the current working directory
    :param max_line: maximum number of lines held in memory at once
    :param column: tab-separated column to sort on
    """
    if not tmp_dir:
        tmp_dir = os.getcwd()
    with tempfile.TemporaryDirectory(dir=tmp_dir) as my_temp_dir:
        temp_dir_file_list = split_large_file(
            big_file, my_temp_dir, max_line, col=column)
        print("splitted")
        merged(liste_file=temp_dir_file_list, out_file=outfile, col=column)
        print("file merged, sorting done")