I have a Python script that, given some configuration, generates a CSV file of random data that can be used for testing purposes.
I want to know whether it adheres to Python and general coding best practices. It is fast enough for my requirements in cases where I need 10K+ rows, so I am not too worried about performance, although input on performance is also appreciated.
Input:
- Schema: a dict holding, for each column, its name, data type and one constraint (a fixed length, a value range, or a list of allowed values)
- Number of rows
- Name of the output CSV file
Script:
import csv
import os
import random as rnd
from abc import ABC, abstractmethod
from collections.abc import Sequence

# CSV creator: writes a CSV file of random values described by a column schema.

# Number of decimal places kept by the float generators. Read at call time,
# so reassigning this module attribute affects later generate() calls.
roundPrecision = 3

# Characters that random strings are drawn from.
_ALPHABET = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"


class BoundType(ABC):
    """Base class for a column generator: a data type plus its constraint.

    Args:
        dtype: Data type label of the column ("int", "float" or "string").
        params: Dict of constraint parameters; the keys read depend on the
            concrete subclass.
    """

    def __init__(self, dtype, params):
        self.dType = dtype
        self.params = params

    @abstractmethod
    def generate(self):
        """Return one random value for the column."""


class FixedLength(BoundType):
    """Generate values with a fixed number of digits/characters.

    Reads ``params["len"]`` (default 1). For "float" the length applies to
    the integer part; ``roundPrecision`` decimals are appended.
    """

    def generate(self):
        """Return a random value of the configured length.

        Returns:
            An int, float or str depending on ``dType``; ``None`` when
            ``dType`` is not one of "int", "float", "string".

        Raises:
            ValueError: If a numeric type is requested with ``len`` < 1.
        """
        length = self.params.get("len", 1)
        if self.dType == "int":
            return self._random_int(length)
        if self.dType == "float":
            whole = self._random_int(length)
            # Truncate (rather than round) the fraction so it can never reach
            # 1.0 and push the integer part to length + 1 digits.
            scale = 10 ** roundPrecision
            fraction = int(rnd.random() * scale) / scale
            # Round the sum: int + float addition may otherwise leave binary
            # noise beyond roundPrecision decimals.
            return round(whole + fraction, roundPrecision)
        if self.dType == "string":
            return "".join(rnd.choice(_ALPHABET) for _ in range(length))
        return None

    @staticmethod
    def _random_int(length):
        """Return a random integer with exactly ``length`` decimal digits."""
        if length < 1:
            raise ValueError(f"'len' must be at least 1 for numeric types, got {length}")
        return rnd.randint(10 ** (length - 1), 10 ** length - 1)


class FixedRange(BoundType):
    """Generate numbers inside a closed range.

    Reads ``params["lohi"]``, a ``(lo, hi)`` pair.
    """

    def generate(self):
        """Return a random number between lo and hi.

        Returns:
            An int for "int", a float rounded to ``roundPrecision`` decimals
            for "float", and ``None`` for any other ``dType``.
        """
        lo, hi = self.params.get("lohi")
        if self.dType == "int":
            return rnd.randint(lo, hi)
        if self.dType == "float":
            return round(rnd.uniform(lo, hi), roundPrecision)
        return None


class FromPossibleValues(BoundType):
    """Pick values from a given collection.

    Reads ``params["set"]``, the collection of allowed values. ``dType`` is
    not consulted; values are returned as stored.
    """

    def generate(self):
        """Return one randomly chosen element of the allowed values.

        Raises:
            IndexError: If the collection is empty or the key is missing.
        """
        values = self.params.get("set", ())
        # random.choice indexes its argument, so sets and other non-indexable
        # iterables must be materialised first.
        if not isinstance(values, Sequence):
            values = tuple(values)
        return rnd.choice(values)


def createcsv(rows, filename, schema):
    """Write ``rows`` random rows to ``./output/<filename>.csv``.

    Args:
        rows: Number of data rows to write (a header row is always written).
        filename: Output file name without the ``.csv`` extension.
        schema: Mapping of column name to a ``BoundType`` generator; the
            mapping's iteration order is the column order.
    """
    path = f'./output/{filename}.csv'
    # Create the target directory up front so a fresh checkout does not fail
    # with FileNotFoundError.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    generators = list(schema.values())
    with open(path, 'w', encoding='UTF8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(schema.keys())
        writer.writerows(
            [generator.generate() for generator in generators]
            for _ in range(rows)
        )
Test:
from csvGen.csvGenerator import FixedLength, FixedRange, FromPossibleValues, createcsv

# One column per generator / data-type combination.
# FromPossibleValues returns the listed values as-is, so the dtype label on
# col6-col8 is descriptive only; it is set to match the values actually listed.
schema = {
    "col1": FixedLength("int", {"len": 5}),
    "col2": FixedLength("float", {"len": 5}),
    "col3": FixedLength("string", {"len": 5}),
    "col4": FixedRange("int", {"lohi": (10, 15)}),
    "col5": FixedRange("float", {"lohi": (5.5, 6.7)}),
    "col6": FromPossibleValues("int", {"set": [1, 2, 3, 4, 5]}),
    "col7": FromPossibleValues("float", {"set": [1.1, 2.2, 3.3]}),
    "col8": FromPossibleValues("string", {"set": ["A", "AB"]}),
}

rows = 10
file_name = "eightVals"

# Writes ./output/eightVals.csv with a header row plus `rows` data rows.
createcsv(rows, file_name, schema)
This is what the output looks like for the given test :
col1 | col2 | col3 | col4 | col5 | col6 | col7 | col8 |
---|---|---|---|---|---|---|---|
51685 | 71830.471 | PAXBK | 12 | 6.192 | 1 | 2.2 | AB |
60384 | 42341.991 | RHNUK | 11 | 6.037 | 1 | 1.1 | AB |
73505 | 30997.171 | DVOGT | 10 | 6.69 | 5 | 2.2 | A |
60528 | 85072.731 | FWWXW | 10 | 5.761 | 1 | 2.2 | A |
23048 | 65401.245 | EVPUX | 13 | 6.474 | 4 | 1.1 | AB |
74748 | 66969.774 | PEULP | 15 | 6.546 | 3 | 2.2 | AB |
88763 | 34749.184 | VOAUO | 10 | 6.402 | 4 | 2.2 | AB |
77351 | 44566.163 | JOBQF | 13 | 5.683 | 1 | 2.2 | AB |
50820 | 73002.154 | EACZT | 15 | 5.711 | 1 | 1.1 | AB |
53037 | 89225.572 | YTLBI | 13 | 6.328 | 1 | 2.2 | AB |