I'm writing code that, starting from an XML file:
- stores the index of the child elements of a tag, and the child elements themselves, as key/value pairs in a dictionary (function `get_xml_by_tag_names`);
- deletes the keys whose values contain a certain string (the specific text size) and puts these keys and the corresponding values into a second dictionary (function `search_delete_append`);
- joins, for each dictionary, the dict values and extracts their text (function `main`);
- replaces certain values with "" (function `main`);
- counts the occurrences of the specific regexes I specify (function `find_regex`).
The `main` function is problematic and I need help cleaning it up: there are too many regexes, and I am thinking of creating a function for each regex inside the `main` function. Would that be a good option?
import re
from xml.dom import minidom
from xml.etree import ElementTree as ET


# Translation table that deletes the punctuation characters stripped from the
# extracted text before the regexes are applied (one pass instead of a chain
# of str.replace calls).
_PUNCTUATION_TO_STRIP = str.maketrans("", "", "<>.,;-!:’?=|()")

# --- Patterns applied to both text collections -----------------------------
FIND_FASE_12T_LEO = re.compile(r"\]\s*AN\s*1\s*([\w\s]+)da\s*cui\s*2\s*([\w\s]+)da\s*cui\s*T")
FIND_FASE_BASE_2 = re.compile(r"\]\s([\w\s]+)\s[→]\sT")  # ] words → T
FIND_FASE_BASE_3 = re.compile(r"\]\s*([\w\s]+)\s*da\scui\sT")  # ] words da cui T
FIND_FASE_12 = re.compile(r"\]\s1\s([\w\s]+)\s2\s([\w\s]+[^T])")  # ] 1 words 2 words (excludes T)
FIND_FASE_PRIMA_12 = re.compile(r"\]\s+prima\s+1\s+([\w\s]+)\s+2([\w\s]+[^T])")  # ] prima 1 words 2 words (excludes T)
FIND_FASE_PRIMA_123 = re.compile(r"\]\sprima\s1\s([\w\s]+)\s2([\w\s]+)\s3([\w\s]+)")
FIND_FASE_PRIMA_123T = re.compile(r"\]\sprima\s1\s([\w\s]+)\s2([\w\s]+)\s3\sT")  # prima 1 words 2 words 3 T
FIND_FASE_PRIMA_1FRECCIA2_BASE = re.compile(r"\]\s+prima\s1\s([\w\s]+)\s[→]\s2([\w\s]+[^T])")  # ] prima 1 word → 2 word

# --- Patterns applied to one collection only (except FIND_FASE12T) ----------
FIND_FASE12T = re.compile(r"\]\s1\s([\w\s]+)\s2\sT")
FIND_FASE123T_OPZ2 = re.compile(r"\]\s*prima\s*1([\w\s]+)\s*2([\w\s][^3|^3T]+) ")
FIND_FASE123T = re.compile(r"\]\s*1([\w\s]+)\s*2([\w\s]+)\s3\sT")
FIND_FASE_123FRECCIAT = re.compile(r"\]\s1\s([\w\s]+)\s2([\w\s]+)\s→\sT")
FIND_FASE_1FRECCIA23T = re.compile(r"\]\s1\s([\w\s]+)\s→\s2([\w\s]+)\s(T|3\sT)")
FIND_FASE_FRECCIA1F2FT = re.compile(r"\]\s1\s([\w\s]+)\s→\s2([\w\s]+)\s→\s(T|3\sT)")
FIND_FASE_PRIMA_123FRECCIAT = re.compile(r"\]\s*prima\s*1\s*([\w\s]+)\s*2([\w\s]+)\s*→\s*T")
FIND_FASE_PRIMA_1FRECCIA23T = re.compile(r"\]\s*prima\s*1\s*([\w\s]+)\s*→\s*2([\w\s]+)\s*(T|3\sT)")
FIND_FASE_PRIMA_FRECCIA1F2FT = re.compile(r"\]\s*prima\s*1\s*([\w\s]+)\s*→\s*2([\w\s]+)\s*→\s*(T|3\sT)")
FIND_FASE_PRIMA_1FRECCIA2 = re.compile(r"\]\s*prima\s*1\s*([\w\s]+)\s*→\s*2([\w\s]+)")
FIND_FASE_PRIMA_12345T = re.compile(r"\]\s*prima\s*1\s*([\w\s]+)\s*2([\w\s]+)\s*3([\w\s]+)\s*4([\w\s]+)\s*5\sT")
FIND_FASE_PRIMA_12345T_OPZ2 = re.compile(r"\]\s*prima\s*1\s*([\w\s]+)\s*2([\w\s]+)\s*3([\w\s]+)\s*4([\w\s][^5|^5\sT]+)")
FIND_FASE_12345T = re.compile(r"\]\s*1\s*([\w\s]+)\s*2([\w\s]+)\s*3([\w\s]+)\s*4([\w\s]+)\s*5\sT")

# Counted on the raw (unparsed) XML of the moved entries.
FIND_CM = re.compile(r"10\.238")

# Patterns currently disabled (kept for reference; add them to a tuple below
# to enable them):
#   find_prima  = r"\]\s*prima(?!\S)"
#   find_da     = r"\]\s*da(?!\S)"
#   find_da_cui = r"\]\s*([\w\s]+)\s*da\scui"
#   find_sps    = r"\]\s*([\w\s]+)\s*sps"
#   find_su     = r"\]\s*([\w\s]+)\s*su"
#   find_as     = r"\]\s*([\w\s]+)\s*as"
#   find_ins    = r"\]\s*([\w\s]+)\s*ins"
#   find_segue  = r"\]\s*([\w\s]+)\s*segue"

_COMMON_PATTERNS = (
    FIND_FASE_12T_LEO,
    FIND_FASE_BASE_2,
    FIND_FASE_BASE_3,
    FIND_FASE_12,
    FIND_FASE_PRIMA_12,
    FIND_FASE_PRIMA_123,
    FIND_FASE_PRIMA_123T,
    FIND_FASE_PRIMA_1FRECCIA2_BASE,
)

# Order matters: it is the order in which the counts are printed.
TESTO_PATTERNS = (FIND_FASE12T,) + _COMMON_PATTERNS

FASI_PATTERNS = (
    FIND_FASE12T,
    FIND_FASE123T_OPZ2,
    FIND_FASE123T,
    FIND_FASE_1FRECCIA23T,
    FIND_FASE_123FRECCIAT,
    FIND_FASE_FRECCIA1F2FT,
    FIND_FASE_PRIMA_1FRECCIA23T,
    FIND_FASE_PRIMA_123FRECCIAT,
    FIND_FASE_PRIMA_FRECCIA1F2FT,
    FIND_FASE_PRIMA_1FRECCIA2,
    FIND_FASE_PRIMA_12345T,
    FIND_FASE_PRIMA_12345T_OPZ2,
    FIND_FASE_12345T,
) + _COMMON_PATTERNS


def get_xml_by_tag_names(xml_path, tag_name_1, tag_name_2):
    """Map the index of each ``tag_name_1`` element to its serialized XML.

    Only elements that contain at least one ``tag_name_2`` descendant are
    kept. The index is the element's position among all ``tag_name_1``
    elements of the document, so skipped elements leave gaps in the keys.

    Args:
        xml_path: Path (or file object) accepted by ``minidom.parse``.
        tag_name_1: Tag name of the container elements to collect.
        tag_name_2: Tag name that must occur inside a container.

    Returns:
        dict mapping ``int`` index to a one-element list holding the
        container's XML string.
    """
    data = {}
    xml_tree = minidom.parse(xml_path)
    for idx, container in enumerate(xml_tree.getElementsByTagName(tag_name_1)):
        # A single existence check replaces re-assigning the same value once
        # per matching child.
        if container.getElementsByTagName(tag_name_2):
            data[idx] = [container.toxml()]
    return data


def find_regex(regex, text):
    """Print and return the total number of ``regex`` matches over ``text``.

    Args:
        regex: Pattern (compiled or string) passed to ``re.findall``.
        text: Iterable of strings to search.

    Returns:
        The summed match count (0 for an empty iterable).
    """
    total = sum(len(re.findall(regex, chunk)) for chunk in text)
    print("The number of {} matches is ".format(regex), total)
    return total


def find_regex_fasi(regex, text):
    """Print and return the number of ``regex`` matches in one string.

    Args:
        regex: Pattern (compiled or string) passed to ``re.findall``.
        text: A single string to search.

    Returns:
        The match count.
    """
    count = len(re.findall(regex, text))
    print("Numero di corpo minore è", count)
    return count


def search_delete_append(dizionario, dizionariofasi, marker="7.489"):
    """Move entries whose values contain ``marker`` into a second dict.

    Both dictionaries are modified in place. For every key of
    ``dizionario`` whose value list contains a string including ``marker``,
    the first such string is stored under the same key in
    ``dizionariofasi`` and the key is removed from ``dizionario``.

    Args:
        dizionario: dict mapping keys to lists of strings.
        dizionariofasi: dict receiving the moved entries (key -> string).
        marker: Substring that selects the entries to move.
    """
    keys_to_move = []
    for key, values in dizionario.items():
        for value in values:
            if marker in value:
                dizionariofasi[key] = value
                keys_to_move.append(key)
                # Stop at the first hit so a key is never queued twice
                # (a second ``del`` of the same key would raise KeyError).
                break
    # Delete after iterating: a dict must not change size while iterated.
    for key in keys_to_move:
        del dizionario[key]


def _extract_clean_text(xml_fragment):
    """Return the punctuation-stripped text of the ``text`` children.

    Parses ``xml_fragment``, joins the text of the root's direct ``text``
    children with single spaces and deletes the characters listed in
    ``_PUNCTUATION_TO_STRIP``.
    """
    root = ET.fromstring(xml_fragment)
    # ``.text`` is None for an empty element; treat it as an empty string so
    # the join does not raise TypeError.
    joined = ' '.join(node.text or "" for node in root.findall('text'))
    return joined.translate(_PUNCTUATION_TO_STRIP)


def main():
    """Read ``output2.xml`` and print the regex match counts.

    The ``new_line`` elements are split into two groups by
    ``search_delete_append``; each group's cleaned text is printed and then
    searched with its own ordered tuple of patterns.
    """
    dict_fasi = {}
    data = get_xml_by_tag_names('output2.xml', 'new_line', 'text')
    search_delete_append(data, dict_fasi)

    testo = [_extract_clean_text(' '.join(value)) for value in data.values()]
    print(testo)
    for pattern in TESTO_PATTERNS:
        find_regex(pattern, testo)

    #################
    # How many CM are there? Counted on the raw XML of the moved entries.
    myxml_fasi = ' '.join(dict_fasi.values())
    find_regex_fasi(FIND_CM, myxml_fasi)

    testo_fasi = [_extract_clean_text(fragment) for fragment in dict_fasi.values()]
    print(testo_fasi)
    for pattern in FASI_PATTERNS:
        find_regex(pattern, testo_fasi)


if __name__ == "__main__":
    main()
I know it's half in Italian right now, but I need to keep it for now for my clarity.