I attempted to implement a Python version of this function in my previous question. Given a regular expression containing one or more bracketed ranges of letters/digits (e.g. `as[1-9]df`), the function expands the brackets and returns a list of strings that explicitly spells out each match. If the input regex has multiple expressions separated by `|`, the output is a dictionary whose keys are the input expressions.
Within `regex_expander`, I created a sub-function, `single_expander`, to handle each expression separated by `|`. Would it be better practice to separate this function out? Besides this, I am also looking for any advice on style, efficiency, and any room for improvement.
Code
import re
import itertools
import warnings

# Characters a bracketed range is expanded against. The order here (upper,
# lower, digits) is also the order in which expansions are emitted.
_ALPHABETS = "abcdefghijklmnopqrstuvwxyz"
_ALPHA_NUMS = _ALPHABETS.upper() + _ALPHABETS + "0123456789"

# A bracketed range: "[" up to the first following "]".
_RANGE_RE = re.compile(r"\[.*?\]")


def _expand_single(expression, verbose):
    """Expand every bracketed range in one "|"-free expression.

    args:
        - expression: a single expression such as "02[W04]F[0-4][JK]Z"
        - verbose: if True, print each combination next to its expansion

    returns:
        A list with one string per combination of range characters. If
        the expression has no bracketed range, a UserWarning is issued and
        the expression itself (a str, not a list) is returned unchanged.
    """
    ranges = _RANGE_RE.findall(expression)
    if not ranges:
        warnings.warn(f"The input expression {expression} does not contain any ranges.")
        return expression

    # Literal text around the brackets; always one more piece than ranges.
    literals = _RANGE_RE.split(expression)
    # Each bracket is itself a valid character class, so matching it against
    # the alphabet yields exactly the characters it stands for.
    choices = [re.findall(rng, _ALPHA_NUMS) for rng in ranges]
    combos = list(itertools.product(*choices))

    # Rebuild positionally instead of using str.replace on the bracket text:
    # two identical brackets (e.g. "a[0-1]b[0-1]") must vary independently.
    # zip stops at the shorter `combo`, so the trailing literal is appended.
    replaced = [
        "".join(lit + char for lit, char in zip(literals, combo)) + literals[-1]
        for combo in combos
    ]

    if verbose:
        print("original string:", expression)
        print("expanded range\treplaced string")
        for combo, result in zip(combos, replaced):
            print(
                "".join(combo).rjust(len("expanded range")),
                result.rjust(len("replaced string")),
                sep="\t",
            )
    return replaced


def regex_expander(rex, verbose=True):
    """
    Given a regex with ranges (e.g. "as[1-9]df"), returns a list of strings
    that expands the brackets and explicitly spells out each match.

    If the input contains multiple expressions separated by "|", returns a
    dictionary mapping each input expression to its expansion.

    An expression without any range is returned unchanged as a plain string
    (and a UserWarning is issued).

    args:
        - rex: regular expression with one or more ranges to expand
        - verbose: if True, will print verbose output
    """
    expressions = rex.split("|")
    if len(expressions) == 1:
        return _expand_single(rex, verbose)
    return {expr: _expand_single(expr, verbose) for expr in expressions}
Examples
# Example: a single expression containing three bracketed ranges.
r = "02[W04]F[0-4][JK]Z"
regex_expander(r, verbose=False)
# output
# ['02WF0JZ', '02WF0KZ', '02WF1JZ', '02WF1KZ', '02WF2JZ', '02WF2KZ',
#  '02WF3JZ', '02WF3KZ', '02WF4JZ', '02WF4KZ', '020F0JZ', '020F0KZ',
#  '020F1JZ', '020F1KZ', '020F2JZ', '020F2KZ', '020F3JZ', '020F3KZ',
#  '020F4JZ', '020F4KZ', '024F0JZ', '024F0KZ', '024F1JZ', '024F1KZ',
#  '024F2JZ', '024F2KZ', '024F3JZ', '024F3KZ', '024F4JZ', '024F4KZ']
# Example: three expressions separated by "|"; the result is a dict keyed by
# each input expression. The first expression has no range, so it triggers a
# warning and maps to the unchanged string.
r = "W3812|405[0-3L-O]|02[W04]F[0-4][JK]Z"
regex_expander(r, verbose=False)
# output
# UserWarning: The input expression W3812 does not contain any ranges.
#   warnings.warn(f"The input expression {rex} does not contain any ranges.")
# {'W3812': 'W3812',
#  '405[0-3L-O]': ['405L', '405M', '405N', '405O', '4050', '4051', '4052', '4053'],
#  '02[W04]F[0-4][JK]Z': ['02WF0JZ', '02WF0KZ', '02WF1JZ', '02WF1KZ', '02WF2JZ',
#                         '02WF2KZ', '02WF3JZ', '02WF3KZ', '02WF4JZ', '02WF4KZ',
#                         '020F0JZ', '020F0KZ', '020F1JZ', '020F1KZ', '020F2JZ',
#                         '020F2KZ', '020F3JZ', '020F3KZ', '020F4JZ', '020F4KZ',
#                         '024F0JZ', '024F0KZ', '024F1JZ', '024F1KZ', '024F2JZ',
#                         '024F2KZ', '024F3JZ', '024F3KZ', '024F4JZ', '024F4KZ']}