I'm writing a code that, starting from an XML file:
- stores the index of each child element of a tag, together with the child element itself, as key/value pairs in a dictionary (function `get_xml_by_tag_names`);
- deletes the keys whose values contain a certain string (the specific text size) and puts these keys and the corresponding values into a second dictionary (`search_delete_append`);
- joins, for each dictionary, the dict values and extracts their text (`main`);
- replaces certain values with "" (`main`);
- counts the occurrences of the specific regexes I specify (`find_regex`).
It works, but the `main` function needs to be cleaned up. My concern is specifically the part where I have to list the regexes I'm interested in: there are several of them, and it can become messy. Another problem is that the cleaning of the XML could be done in a separate function, but so far I haven't managed to do that.
Here is the code:
import re
from xml.dom import minidom
from xml.etree import ElementTree as ET

try:
    from bs4 import BeautifulSoup  # noqa: F401  (kept from the original; unused below)
except ImportError:
    # Nothing in this script calls BeautifulSoup, so a missing bs4 install
    # should not stop the script from running.
    BeautifulSoup = None

# Tag whose direct children carry the text of one line.
TEXT_TAG = 'text'

# Lines whose serialized XML contains this string (a text size) are moved to
# the separate "fasi" dictionary.
FASI_MARKER = "7.489"

# Substrings stripped from the body text. They are applied one after another,
# in this order, so a later entry (e.g. "<>") can match text that only
# appears once an earlier entry has been removed.
STRINGS_TO_REMOVE = (" < ", " >", ".", ",", ";", "-", "!", ":", "’", "?", "<>")

# --- Regexes to count: add new ones to the tuple for the text they apply to.
# Counted in the cleaned body text. Empty because the original script had
# this call commented out.
BODY_TEXT_PATTERNS = ()
# Counted in the raw XML of the "fasi" lines (attributes included).
FASI_XML_PATTERNS = (
    re.compile(r"10\.238"),  # how many CM are there?
)
# Counted in the plain text extracted from the "fasi" lines.
FASI_TEXT_PATTERNS = (
    re.compile(r"\]\s*prima(?!\S)"),
)


def get_xml_by_tag_names(xml_path, tag_name_1, tag_name_2):
    """Map the index of each ``tag_name_1`` element to its serialized XML.

    Only elements containing at least one ``tag_name_2`` descendant are kept.

    Args:
        xml_path: Path (or file object) of the XML document to parse.
        tag_name_1: Name of the enclosing elements to collect.
        tag_name_2: Name of the element that must occur inside them.

    Returns:
        A dict whose keys are positions among all ``tag_name_1`` elements and
        whose values are one-item lists holding the element's ``toxml()``.
    """
    xml_tree = minidom.parse(xml_path)
    data = {}
    for idx, node in enumerate(xml_tree.getElementsByTagName(tag_name_1)):
        # One serialization per element is enough; the original redid it for
        # every matching child and overwrote the same key each time.
        if node.getElementsByTagName(tag_name_2):
            data[idx] = [node.toxml()]
    return data


def find_regex(regex, text):
    """Print and return how many times ``regex`` matches in ``text``.

    Args:
        regex: A pattern string or a compiled pattern.
        text: The string to search.

    Returns:
        The number of non-overlapping matches.
    """
    # Show the pattern source rather than the repr of a compiled object.
    pattern = getattr(regex, "pattern", regex)
    count = len(re.findall(regex, text))
    print(f"The number of {pattern} matches is {count}")
    return count


def search_delete_append(dizionario, dizionariofasi, marker=FASI_MARKER):
    """Move every entry containing ``marker`` into ``dizionariofasi``.

    Both dictionaries are modified in place. For each key of ``dizionario``
    whose list of strings has an item containing ``marker``, the first such
    item is stored under the same key in ``dizionariofasi`` and the key is
    removed from ``dizionario``.

    Args:
        dizionario: Dict mapping keys to lists of strings.
        dizionariofasi: Dict receiving ``key -> matching string``.
        marker: Substring that selects the entries to move.
    """
    moved = {}
    for key, values in dizionario.items():
        hit = next((value for value in values if marker in value), None)
        if hit is not None:
            moved[key] = hit
    # Delete after the scan: the dict cannot change size while it is being
    # iterated, and each key is handled once even if several items matched.
    for key, hit in moved.items():
        dizionariofasi[key] = hit
        del dizionario[key]


def _line_text(xml_fragment):
    """Return the space-joined text of the direct text children of one line."""
    root = ET.fromstring(xml_fragment)
    # ``.text`` is None for an empty element; treat that as an empty string.
    return ' '.join(node.text or "" for node in root.findall(TEXT_TAG))


def _clean_text(text):
    """Return ``text`` with every entry of STRINGS_TO_REMOVE stripped out."""
    for to_remove in STRINGS_TO_REMOVE:
        text = text.replace(to_remove, "")
    return text


def main(xml_path='output2.xml'):
    """Split the lines of ``xml_path`` into body and "fasi" and count regexes.

    Prints one count per pattern in the module-level pattern tuples, plus the
    plain text of the "fasi" lines.

    Args:
        xml_path: XML file to analyse.
    """
    data = get_xml_by_tag_names(xml_path, 'new_line', TEXT_TAG)
    dict_fasi = {}
    search_delete_append(data, dict_fasi)

    # Body: everything that was not moved to dict_fasi, punctuation removed.
    testo = ''.join(
        _clean_text(_line_text(' '.join(value))) for value in data.values()
    )
    for pattern in BODY_TEXT_PATTERNS:
        find_regex(pattern, testo)

    # "Fasi": first the raw XML (so attribute values can be matched) ...
    myxml_fasi = ' '.join(dict_fasi.values())
    for pattern in FASI_XML_PATTERNS:
        find_regex(pattern, myxml_fasi)

    # ... then the extracted plain text.
    testo_fasi = ''.join(_line_text(fragment) for fragment in dict_fasi.values())
    print(testo_fasi)
    for pattern in FASI_TEXT_PATTERNS:
        find_regex(pattern, testo_fasi)


if __name__ == "__main__":
    main()