Python function to find specific regex in the text of an XML document

Question

I'm writing a code that, starting from an XML file:

stores the index of child elements of a tag and the child elements as key, values in a dictionary (function get_xml_by_tag_names);
deletes keys whose values contain a certain string (the specific text size) and puts these keys and the corresponding values into a second dictionary (def search_delete_append);
joins, for each dictionary, the dict values and extracts their text(def main);
replaces certain values with "" (def main);
counts the occurrences of specific regex I specify (def find_regex).

The main function is problematic, as I need help cleaning it up, the regex are too many and I want to create a function for each regex inside the main function. Would it be a good option?

import re from xml.dom import minidom from xml.etree import ElementTree as ET def get_xml_by_tag_names(xml_path, tag_name_1, tag_name_2): data = {} xml_tree = minidom.parse(xml_path) item_group_nodes = xml_tree.getElementsByTagName(tag_name_1) for idx, item_group_node in enumerate(item_group_nodes): cl_compile_nodes = item_group_node.getElementsByTagName(tag_name_2) for _ in cl_compile_nodes: data[idx]=[item_group_node.toxml()] return data def find_regex(regex, text): lista = [] for x in text: matches_prima = re.findall(regex, x) lunghezza = len(matches_prima) lista.append(lunghezza) print("The number of {} matches is ".format(regex), sum(lista)) def find_regex_fasi(regex, text): matches_fasi = re.findall(regex, text) print("Numero di corpo minore è", len(matches_fasi)) def search_delete_append(dizionario, dizionariofasi): deletekeys = [] insertvalues = [] for k in dizionario: for v in dizionario[k]: if "7.489" in v: deletekeys.append(k) dizionariofasi[k] = v for item in deletekeys: del dizionario[item] def main(): dict_fasi = {} data = get_xml_by_tag_names('output2.xml', 'new_line', 'text') search_delete_append(data, dict_fasi) testo = [] for value in data.values(): myxml = ' '.join(value) tree = ET.fromstring(myxml) tmpstring = ' '.join(text.text for text in tree.findall('text')) for to_remove in ("<", ">", ".", ",", ";", "-", "!", ":", "’", "?", "<>", "=", "|", "(", ")"): tmpstring = tmpstring.replace(to_remove, "") testo.append(tmpstring) #testo = ''.join(testo) print(testo) find_fase_12T_leo = re.compile(r"\]\s*AN\s*1\s*([\w\s]+)da\s*cui\s*2\s*([\w\s]+)da\s*cui\s*T") #find_prima = re.compile(r"\]\s*prima(?!\S)") find_fase_base_2 = re.compile(r"\]\s([\w\s]+)\s[→]\sT") # ] parole → T find_fase_base_3 = re.compile(r"\]\s*([\w\s]+)\s*da\scui\sT") # ] parole da cui T find_fase_12 = re.compile(r"\]\s1\s([\w\s]+)\s2\s([\w\s]+[^T])") # ] 1 parole 2 parole (esclude T) find_fase_prima_12 = re.compile(r"\]\s+prima\s+1\s+([\w\s]+)\s+2([\w\s]+[^T])") # ] prima 1 parole 2 parole (esclude T) find_fase_prima_123 = re.compile(r"\]\sprima\s1\s([\w\s]+)\s2([\w\s]+)\s3([\w\s]+)") find_fase_prima_123T = re.compile(r"\]\sprima\s1\s([\w\s]+)\s2([\w\s]+)\s3\sT") #prima 1 parole 2 parole 3t find_fase_prima_1freccia2 = re.compile(r"\]\s+prima\s1\s([\w\s]+)\s[→]\s2([\w\s]+[^T])") #] prima 1 parola → 2 parola FIND_FASE12T = re.compile(r"\]\s1\s([\w\s]+)\s2\sT") FIND_FASE123T_OPZ2 = re.compile(r"\]\s*prima\s*1([\w\s]+)\s*2([\w\s][^3|^3T]+) ") FIND_FASE123T = re.compile(r"\]\s*1([\w\s]+)\s*2([\w\s]+)\s3\sT") FIND_FASE_123FRECCIAT = re.compile(r"\]\s1\s([\w\s]+)\s2([\w\s]+)\s→\sT") FIND_FASE_1FRECCIA23T = re.compile(r"\]\s1\s([\w\s]+)\s→\s2([\w\s]+)\s(T|3\sT)") FIND_FASE_FRECCIA1F2FT = re.compile(r"\]\s1\s([\w\s]+)\s→\s2([\w\s]+)\s→\s(T|3\sT)") FIND_FASE_PRIMA_123FRECCIAT = re.compile(r"\]\s*prima\s*1\s*([\w\s]+)\s*2([\w\s]+)\s*→\s*T") FIND_FASE_PRIMA_1FRECCIA23T = re.compile(r"\]\s*prima\s*1\s*([\w\s]+)\s*→\s*2([\w\s]+)\s*(T|3\sT)") FIND_FASE_PRIMA_FRECCIA1F2FT = re.compile(r"\]\s*prima\s*1\s*([\w\s]+)\s*→\s*2([\w\s]+)\s*→\s*(T|3\sT)") FIND_FASE_PRIMA_1FRECCIA2 = re.compile(r"\]\s*prima\s*1\s*([\w\s]+)\s*→\s*2([\w\s]+)") FIND_FASE_PRIMA_12345T = re.compile(r"\]\s*prima\s*1\s*([\w\s]+)\s*2([\w\s]+)\s*3([\w\s]+)\s*4([\w\s]+)\s*5\sT") FIND_FASE_PRIMA_12345T_OPZ2 = re.compile(r"\]\s*prima\s*1\s*([\w\s]+)\s*2([\w\s]+)\s*3([\w\s]+)\s*4([\w\s][^5|^5\sT]+)") FIND_FASE_12345T = re.compile(r"\]\s*1\s*([\w\s]+)\s*2([\w\s]+)\s*3([\w\s]+)\s*4([\w\s]+)\s*5\sT") #find_da = re.compile(r"\]\s*da(?!\S)") #find_da_cui = re.compile(r"\]\s*([\w\s]+)\s*da\scui") #find_sps = re.compile(r"\]\s*([\w\s]+)\s*sps") #find_su = re.compile(r"\]\s*([\w\s]+)\s*su") #find_as = re.compile(r"\]\s*([\w\s]+)\s*as") #find_ins = re.compile(r"\]\s*([\w\s]+)\s*ins") #find_segue = re.compile(r"\]\s*([\w\s]+)\s*segue") find_regex(FIND_FASE12T, testo) find_regex(find_fase_12T_leo, testo) #find_regex(find_prima, testo) find_regex(find_fase_base_2, testo) find_regex(find_fase_base_3, testo) find_regex(find_fase_12, testo) find_regex(find_fase_prima_12, testo) find_regex(find_fase_prima_123, testo) find_regex(find_fase_prima_123T, testo) find_regex(find_fase_prima_1freccia2, testo) #find_regex(find_da, testo) #find_regex(find_da_cui, testo) #find_regex(find_sps, testo) #find_regex(find_su, testo) #find_regex(find_as, testo) #find_regex(find_ins, testo) #find_regex(find_segue, testo) ################# testo_fasi = [] values = [x for x in dict_fasi.values()] myxml_fasi = ' '.join(values) find_CM = re.compile(r"10\.238") find_regex_fasi(find_CM, myxml_fasi) #quanti CM ci sono? #print(myxml_fasi) for x in dict_fasi.values(): xxx= ''.join(x) tree2 = ET.fromstring(xxx) tmpstring2 = ' '.join(text.text for text in tree2.findall('text')) for to_remove in ("<", ">", ".", ",", ";", "-", "!", ":", "’", "?", "<>", "=", "|", "(", ")"): tmpstring2 = tmpstring2.replace(to_remove, "") testo_fasi.append(tmpstring2) #testo_fasi = ''.join(testo_fasi) print(testo_fasi) find_regex(FIND_FASE12T, testo_fasi) find_regex(FIND_FASE123T_OPZ2, testo_fasi) find_regex(FIND_FASE123T, testo_fasi) find_regex(FIND_FASE_1FRECCIA23T, testo_fasi) find_regex(FIND_FASE_123FRECCIAT, testo_fasi) find_regex(FIND_FASE_FRECCIA1F2FT, testo_fasi) find_regex(FIND_FASE_PRIMA_1FRECCIA23T, testo_fasi) find_regex(FIND_FASE_PRIMA_123FRECCIAT, testo_fasi) find_regex(FIND_FASE_PRIMA_FRECCIA1F2FT, testo_fasi) find_regex(FIND_FASE_PRIMA_1FRECCIA2, testo_fasi) find_regex(FIND_FASE_PRIMA_12345T, testo_fasi) find_regex(FIND_FASE_PRIMA_12345T_OPZ2, testo_fasi) find_regex(FIND_FASE_12345T, testo_fasi) find_regex(find_fase_12T_leo, testo_fasi) #find_regex(find_prima, testo_fasi) find_regex(find_fase_base_2, testo_fasi) find_regex(find_fase_base_3, testo_fasi) find_regex(find_fase_12, testo_fasi) find_regex(find_fase_prima_12, testo_fasi) find_regex(find_fase_prima_123, testo_fasi) find_regex(find_fase_prima_123T, testo_fasi) find_regex(find_fase_prima_1freccia2, testo_fasi) #find_regex(find_da, testo_fasi) #find_regex(find_da_cui, testo_fasi) #find_regex(find_sps, testo_fasi) #find_regex(find_su, testo_fasi) #find_regex(find_as, testo_fasi) #find_regex(find_ins, testo_fasi) #find_regex(find_segue, testo_fasi) if __name__ == "__main__": main()

I know it's half in Italian right now, but I need to keep it for now for my clarity.

Hey. Could you reduce this to a minimal working example? Perhaps include a valid minimal input and the corresponding output too? — Eman Yalpsid, CommentedMay 3, 2020 at 14:31

Eman Yalpsid · Accepted Answer · 2020-05-03 16:18:18Z

We should only use as many variables as needed

For example,

values = [x for x in dict_fasi.values()] myxml_fasi = ' '.join(values)

could be

myxml_fasi = ' '.join(dict_fasi.values())

We can reduce the number of strings created

for to_remove in ("<", ">", ".", ",", ";", "-", "!", ":", "’", "?", "<>", "=", "|", "(", ")"): tmpstring2 = tmpstring2.replace(to_remove, "")

could be

tmpstring2 = ''.join(c for c in tmpstring2 if c not in set("|=?-<>’(!.:,;"))

The first creates a new string with each iteration. N.b. after deleting < and >, there won't be any <> in the text.

Separating input/output from processing functions

I'd try to limit interaction with the world to as few functions as possible. For example I would not expect a function named find_regex_fasi to print anything to the console (or elsewhere). I'd make it return its results and do the printing inside main.

find_fase_12T_leo = re.compile(r"\]\s*AN\s*1\s*([\w\s]+)da\s*cui\s*2\s*([\w\s]+)da\s*cui\s*T") #find_prima = re.compile(r"\]\s*prima(?!\S)") find_fase_base_2 = re.compile(r"\]\s([\w\s]+)\s[→]\sT") # ] parole → T find_fase_base_3 = re.compile(r"\]\s*([\w\s]+)\s*da\scui\sT") # ] parole da cui T find_fase_12 = re.compile(r"\]\s1\s([\w\s]+)\s2\s([\w\s]+[^T])") # ] 1 parole 2 parole (esclude T) find_fase_prima_12 = re.compile(r"\]\s+prima\s+1\s+([\w\s]+)\s+2([\w\s]+[^T])") # ] prima 1 parole 2 parole (esclude T) find_fase_prima_123 = re.compile(r"\]\sprima\s1\s([\w\s]+)\s2([\w\s]+)\s3([\w\s]+)") find_fase_prima_123T = re.compile(r"\]\sprima\s1\s([\w\s]+)\s2([\w\s]+)\s3\sT") #prima 1 parole 2 parole 3t find_fase_prima_1freccia2 = re.compile(r"\]\s+prima\s1\s([\w\s]+)\s[→]\s2([\w\s]+[^T])") #] prima 1 parola → 2 parola FIND_FASE12T = re.compile(r"\]\s1\s([\w\s]+)\s2\sT") FIND_FASE123T_OPZ2 = re.compile(r"\]\s*prima\s*1([\w\s]+)\s*2([\w\s][^3|^3T]+) ") FIND_FASE123T = re.compile(r"\]\s*1([\w\s]+)\s*2([\w\s]+)\s3\sT") FIND_FASE_123FRECCIAT = re.compile(r"\]\s1\s([\w\s]+)\s2([\w\s]+)\s→\sT") FIND_FASE_1FRECCIA23T = re.compile(r"\]\s1\s([\w\s]+)\s→\s2([\w\s]+)\s(T|3\sT)") FIND_FASE_FRECCIA1F2FT = re.compile(r"\]\s1\s([\w\s]+)\s→\s2([\w\s]+)\s→\s(T|3\sT)") FIND_FASE_PRIMA_123FRECCIAT = re.compile(r"\]\s*prima\s*1\s*([\w\s]+)\s*2([\w\s]+)\s*→\s*T") FIND_FASE_PRIMA_1FRECCIA23T = re.compile(r"\]\s*prima\s*1\s*([\w\s]+)\s*→\s*2([\w\s]+)\s*(T|3\sT)") FIND_FASE_PRIMA_FRECCIA1F2FT = re.compile(r"\]\s*prima\s*1\s*([\w\s]+)\s*→\s*2([\w\s]+)\s*→\s*(T|3\sT)") FIND_FASE_PRIMA_1FRECCIA2 = re.compile(r"\]\s*prima\s*1\s*([\w\s]+)\s*→\s*2([\w\s]+)") FIND_FASE_PRIMA_12345T = re.compile(r"\]\s*prima\s*1\s*([\w\s]+)\s*2([\w\s]+)\s*3([\w\s]+)\s*4([\w\s]+)\s*5\sT") FIND_FASE_PRIMA_12345T_OPZ2 = re.compile(r"\]\s*prima\s*1\s*([\w\s]+)\s*2([\w\s]+)\s*3([\w\s]+)\s*4([\w\s][^5|^5\sT]+)") FIND_FASE_12345T = re.compile(r"\]\s*1\s*([\w\s]+)\s*2([\w\s]+)\s*3([\w\s]+)\s*4([\w\s]+)\s*5\sT") #find_da = re.compile(r"\]\s*da(?!\S)") #find_da_cui = re.compile(r"\]\s*([\w\s]+)\s*da\scui") #find_sps = re.compile(r"\]\s*([\w\s]+)\s*sps") #find_su = re.compile(r"\]\s*([\w\s]+)\s*su") #find_as = re.compile(r"\]\s*([\w\s]+)\s*as") #find_ins = re.compile(r"\]\s*([\w\s]+)\s*ins") #find_segue = re.compile(r"\]\s*([\w\s]+)\s*segue") find_regex(FIND_FASE12T, testo) find_regex(find_fase_12T_leo, testo) #find_regex(find_prima, testo) find_regex(find_fase_base_2, testo) find_regex(find_fase_base_3, testo) find_regex(find_fase_12, testo) find_regex(find_fase_prima_12, testo) find_regex(find_fase_prima_123, testo) find_regex(find_fase_prima_123T, testo) find_regex(find_fase_prima_1freccia2, testo) #find_regex(find_da, testo) #find_regex(find_da_cui, testo) #find_regex(find_sps, testo) #find_regex(find_su, testo) #find_regex(find_as, testo) #find_regex(find_ins, testo) #find_regex(find_segue, testo)

can become something like

find_phase_regexes = { k: re.compile(v) for k, v in { "12T_leo": r"\]\s*AN\s*1\s*([\w\s]+)da\s*cui\s*2\s*([\w\s]+)da\s*cui\s*T", "prima": r"\]\s*prima(?!\S)", "base_2": r"\]\s([\w\s]+)\s[→]\sT", # ] parole → T "base_3": r"\]\s*([\w\s]+)\s*da\scui\sT", # ] parole da cui T "12": r"\]\s1\s([\w\s]+)\s2\s([\w\s]+[^T])", # ] 1 parole 2 parole (esclude T) "prima_12": r"\]\s+prima\s+1\s+([\w\s]+)\s+2([\w\s]+[^T])", # ] prima 1 parole 2 parole (esclude T) "prima_123": r"\]\sprima\s1\s([\w\s]+)\s2([\w\s]+)\s3([\w\s]+)", "prima_123T": r"\]\sprima\s1\s([\w\s]+)\s2([\w\s]+)\s3\sT", #prima 1 parole 2 parole 3t "prima_1freccia2": r"\]\s+prima\s1\s([\w\s]+)\s[→]\s2([\w\s]+[^T])", #] prima 1 parola → 2 parola "12T": r"\]\s1\s([\w\s]+)\s2\sT", "123T_OPZ2": r"\]\s*prima\s*1([\w\s]+)\s*2([\w\s][^3|^3T]+) ", "123T": r"\]\s*1([\w\s]+)\s*2([\w\s]+)\s3\sT", "123FRECCIAT": r"\]\s1\s([\w\s]+)\s2([\w\s]+)\s→\sT", "1FRECCIA23T": r"\]\s1\s([\w\s]+)\s→\s2([\w\s]+)\s(T|3\sT)", "FRECCIA1F2FT": r"\]\s1\s([\w\s]+)\s→\s2([\w\s]+)\s→\s(T|3\sT)", "PRIMA_123FRECCIAT": r"\]\s*prima\s*1\s*([\w\s]+)\s*2([\w\s]+)\s*→\s*T", "PRIMA_1FRECCIA23T": r"\]\s*prima\s*1\s*([\w\s]+)\s*→\s*2([\w\s]+)\s*(T|3\sT)", "PRIMA_FRECCIA1F2FT": r"\]\s*prima\s*1\s*([\w\s]+)\s*→\s*2([\w\s]+)\s*→\s*(T|3\sT)", "PRIMA_1FRECCIA2": r"\]\s*prima\s*1\s*([\w\s]+)\s*→\s*2([\w\s]+)", "PRIMA_12345T": r"\]\s*prima\s*1\s*([\w\s]+)\s*2([\w\s]+)\s*3([\w\s]+)\s*4([\w\s]+)\s*5\sT", "PRIMA_12345T_OPZ2": r"\]\s*prima\s*1\s*([\w\s]+)\s*2([\w\s]+)\s*3([\w\s]+)\s*4([\w\s][^5|^5\sT]+)", "12345T": r"\]\s*1\s*([\w\s]+)\s*2([\w\s]+)\s*3([\w\s]+)\s*4([\w\s]+)\s*5\sT", }.items() } for k, v in find_phase_regexes.items(): find_regex(v, testo)

RootTwo · Accepted Answer · 2020-05-05 07:05:24Z

str.maketrans and str.translate

str.maketrans() is much faster than:

 for to_remove in ("<", ">", ".", ",", ";", "-", "!", ":", "’", "?", "<>", "=", "|", "(", ")"): tmpstring = tmpstring.replace(to_remove, "")

Instead:

# create the table once at the beginning of main or globally table = str.maketrans({c:None for c in "<>.,;-!:’?=|()"}) # then do this instead of the for-loop tmpstring = tmpstring.translate(table)

Stack Exchange Network

Python function to find specific regex in the text of an XML document

2 Answers 2

str.maketrans and str.translate

Hot Network Questions

Python function to find specific regex in the text of an XML document

2 Answers 2

str.maketrans and str.translate

Related

Hot Network Questions