I have a Python script that downloads, unzips, and parses an XML file published by a Canadian institution. Only some very specific tags are extracted, and then everything is put into a pandas DataFrame for later processing.
Everything works well. I just wonder if there is room for improvement here, especially in the parsing part. I am not sure whether the nested for loops
I use are a good idea, or whether there is a better and cleaner way to parse.
import requests
import zipfile
import os
import glob
from lxml import etree
from io import StringIO, BytesIO
import pandas as pd
import xml.etree.ElementTree as ET


def download_file(url, filename):
    """Download *url* (following redirects) and save the response body to *filename*."""
    r = requests.get(url, allow_redirects=True)
    r.raise_for_status()  # fail loudly on HTTP errors instead of saving an error page
    # Context manager guarantees the handle is closed (the original leaked it).
    with open(filename, 'wb') as fh:
        fh.write(r.content)


def unzip_and_delete(filename):
    """Extract the zip archive *filename* into the current directory, then remove it."""
    with zipfile.ZipFile(filename, 'r') as zf:
        zf.extractall()
    os.remove(filename)


def parse_xml_fields(file, base_tag, tag_list, final_list):
    """Collect selected child-tag texts from every *base_tag* element in *file*.

    For each ``base_tag`` element found anywhere in the document, builds a dict
    mapping tag name -> stripped text for every tag in *tag_list* that is
    present, and appends that dict to *final_list* (mutated in place).
    Returns *final_list* as well, for convenience.
    """
    root = etree.parse(file)
    # ".//" is the correct relative ElementPath; a leading "//" is deprecated in lxml.
    for node in root.findall(".//{}".format(base_tag)):
        item = {}
        for tag in tag_list:
            elem = node.find(".//{}".format(tag))  # single lookup (original searched twice)
            # Guard both missing elements and empty ones: an empty element has
            # .text == None, and .text.strip() would raise AttributeError.
            if elem is not None and elem.text is not None:
                item[tag] = elem.text.strip()
        final_list.append(item)
    return final_list


def main():
    """Fetch the FINTRAC MSB registry, extract the fields of interest, export to Excel."""
    field_list = ["MsbRegistrationNumber", "StatusDescriptionEnglish", "Surname",
                  "GivenName", "MiddleName", "Name", "StreetAddress"]
    entities_list = []

    download_file('http://www10.fintrac-canafe.gc.ca/msb-esm/public/msb-search/zipdownload-eng/',
                  'fintrac.zip')
    unzip_and_delete('fintrac.zip')
    parse_xml_fields("MsbRegistryPublicDataFile.xml", "MsbInformation",
                     field_list, entities_list)

    df = pd.DataFrame(entities_list, columns=field_list)
    df.to_excel("Canada_MSB_List.xlsx")


# Guard so importing this module no longer triggers the download side effects.
if __name__ == "__main__":
    main()