Python script to read an XML file and write some information to an Excel file

Question

I'm getting an XML file from an application, want to extract some information from that file and write it to an Excel file. The XML file contains data about vacations. I need data from the last month with an approval status 1, 5 or 6.

The following function returns the month and year:

 def set_month(): year = time.localtime()[0] month = time.localtime()[1] if month == 1: month = 12 year -= 1 else: month -= 1 return (year, month)

I assume there is a better way to do this.

Below you'll find the complete script and an example XML file. I'd be happy about any ideas to improve the code.

#!/usr/bin/env python3 # file: urlaub.py # desc: Reads a xml file created from another software. This file contains # vacation requests. This program search all vacation requests for the month # ago, grabs the relevant informations and write they into an excel file. # # possible exit status if the program does not crash: # 0: all clear # 1: you hav a double character in your wantedInfos directory # 2: failed to read the xml file or to get the tree root # 3: failed to write the excel file import time import logging from sys import exit import datetime as dt from openpyxl import Workbook as WB from openpyxl.styles import Font import xml.etree.ElementTree as ET # --- logging --- # def configure_logging(loglevel): ''' Configure logging settings. param 1: integer ''' formatstring = "%(levelname)-8s %(message)s" logging.basicConfig(format=formatstring, level=loglevel) logging.info("Loglevel set to {}".format(loglevel)) # --- misk --- # def check_doubles(): ''' Checks for double (column) characters in the wantedInfos dictionary. ''' char_list = [] logging.debug("Check for double cell characters") for dictionary in (wantedInfos, interestedAbsence): for key in dictionary.keys(): character = dictionary[key]["Col"] if character in char_list: logging.error("Double cell character found: {}".format(character)) exit(1) char_list.append(character) logging.debug("Character list {}".format(sorted(char_list))) # --- xml processing --- # def set_month(): ''' Searchs for the month ago. Returns year and month as string. returns: tuple of strings ''' year = time.localtime()[0] month = time.localtime()[1] if month == 1: month = 12 year -= 1 else: month -= 1 logging.info("Search for vacation requests in {} {}".format(month, year)) return (year, month) def absence_is_relevant(absence): ''' Checks if the given absence is relevant. This includes an check for time and also for approval status. param 1: xml element returns: boolean ''' start = absence.find(get_path(["Beginning"])) date = start.text.split("T") logging.debug("Test absence for {}".format(date)) year, month, day = date[0].split("-") if int(year) == wantedYear and int(month) == wantedMonth: logging.debug("Absence is relevant") status = absence.find(get_path(["ApprovalStatus"])) if int(status.text) in (0, 2, 4): logging.debug("Status is {} ... skip.".format(status.text)) return False return True logging.debug("Absence is not relevant") return False def request_is_relevant(request): ''' Checks if the given request is relevant. Walks through the absence list and calls absence_is_relevant for every absence found. param 1: xml tree element returns: boolean ''' guid_main = request.find(get_path(["GUID"])) name = request.find(get_path(["RequestorFullName"])) logging.debug("Test for relevanz: {}".format(guid_main.text)) for absence in request.findall(get_path(["AbsenceList", "Absence"])): if absence_is_relevant(absence) is True: logging.debug("Request is relevant") return True logging.debug("Request is not relevant") return False def get_path(part_list): ''' Because i cannot access the data only with the tag name, we add the namespace for every tree level. param 1: list returns: string ''' parts = [] for tag in part_list: parts.append(get_full_tag(tag)) path = "/".join(parts) return path def get_full_tag(tag): ''' Adds the namespace to an single xml tag. param 1: string returns: string ''' return "".join((ns, tag)) def crop_namespace(node): ''' param 1: string returns: string or none ''' if node is None: return None return node.tag.split("}")[1].strip() def process_request(request): ''' Processes the given request. We walk two times through all keys from the wantedInfos dictionary. First time we collect all data for keys flagged with "Abs" is false. Second time we walk through all branches from absence list. If the absence is relevant, we collect all data for keys flagged with true. These data will be stored in a subdictionary with absenceCounter as key. param 1: xml-tree object returns: dictionary ''' currentRequest: dict = {'AbsenceList': {}, } relevantCounter = 0 absenceCounter = 0 guid_main = request.find(get_path(["GUID"])) name = request.find(get_path(["RequestorFullName"])) logging.debug("Processing {}: {}".format(guid_main.text, name.text)) # collect request level information for key in wantedInfos.keys(): if wantedInfos[key]['Abs'] is False: path = get_path(wantedInfos[key]["Path"]) logging.debug("Search for key '{}' '{}'".format(key, path)) node = request.find(path) logging.debug("Found node '{}'".format(crop_namespace(node))) if node is None: value = None else: value = node.text currentRequest[key] = value logging.debug("Store: key: {}, value: '{}'".format(key, value)) # walk through absence list and collect absence level informations for absence in request.findall(get_path(["AbsenceList", "Absence"])): absenceCounter += 1 guid_abs = absence.find(get_path(["VacationRequestGUID"])) logging.debug("{}. Abs: {}".format(absenceCounter, guid_abs.text)) if absence_is_relevant(absence) is False: logging.debug("Skip absence") continue logging.debug("Processing absence") relevantCounter += 1 currentRequest['AbsenceList'][relevantCounter] = {} for key in wantedInfos.keys(): if wantedInfos[key]['Abs'] is True: path = get_path(wantedInfos[key]['Path']) value = absence.find(path).text currentRequest['AbsenceList'][relevantCounter][key] = value logging.debug("Store: key: '{}', value: '{}'".format(value, key)) return currentRequest def xml_to_list(): ''' Collects the wanted informations from a xml tree and store these data in a list of dictionarys. returns: list of dictinaries ''' counter = 0 relevant = 0 allVacations = [] # read the xml tree from file and grab tree root try: tree = ET.parse(xmlfile) logging.debug("Reading xml-file {} successful".format(xmlfile)) root = tree.getroot() except Exception as error: logging.error("Failed to get xml-tree root.") logging.error(error) exit(2) # iterate over all vacation requests and check if the request is # relevant. for request in root.findall(get_full_tag("VacationRequests")): counter += 1 logging.debug("* {}. New request found".format(counter)) guid = request.find(get_path(["GUID"])) name = request.find(get_path(["RequestorFullName"])) logging.debug("{}: {}".format(guid.text, name.text)) if request_is_relevant(request) is False: continue else: # if request is relevant call function to grab all needed data. relevant += 1 logging.debug("Relevant entry found: {}".format(relevant)) currentRequest = process_request(request) allVacations.append(currentRequest) logging.info("{} relevant requests found".format(len(allVacations))) return(allVacations) # --- excel --- # def get_cell_name(character, number): ''' Returns an string from column character and row number. param 1: string param 2: integer returns: string ''' return "".join((character.strip(), str(number).strip())) def write_sheet_header(worksheet): ''' Writes the first row in the worksheet. The cell text is the key from wantedInfos. The column character comes from the keys "col" flag. We use characters because worksheet.column_dimensions cannot work with integer. param 1: worksheet object ''' row = 1 boldFont = Font(name="Calibri", bold = True) for key in wantedInfos.keys(): width = wantedInfos[key]["Width"] column = wantedInfos[key]["Col"] worksheet.column_dimensions[column].width = width cell_name = get_cell_name(column, row) cell = worksheet[cell_name] cell.font = boldFont worksheet[cell_name] = key logging.debug("Column {} was set to width {}".format(key, width)) worksheet.freeze_panes = "ZZ2" logging.debug("First row pinned") def write_cell(worksheet, row, key, value): ''' Writes "value" into a cell. The cell name is build from the wantedInfos "Col" flag for "key" and row. param 1: sheet object param 2: integer param 3: string param 4: string ''' # any values needs to convert into an readable format if key == "Art": value = absenceKindDict[value] if key == "Tage": value = int(int(value) / 2) elif key in ("Freigabe1", "Freigabe2", "Freigabe"): value = approvalStatusDict[value] elif key in ("Start", "Ende", "Eingetr."): value = dt.date.fromisoformat(value.split("T")[0]) # write column = wantedInfos[key]['Col'] cell_name = get_cell_name(column, row) worksheet[cell_name] = value if key in ("Start", "Ende", "Eingetr."): worksheet[cell_name].number_format = "dd.mm.yyyy" logging.debug("{}: '{}' written".format(cell_name, value)) def write_excel(vacationList): ''' Becomes the vacation requests list and writes there data into an excel file. param 1: list of dictionaries ''' year = str(wantedYear) if wantedMonth in range(1, 10): month = "".join(("0", str(wantedMonth))) else: month = str(wantedMonth) fileName = "material/urlaub-{}-{}.xls".format(year, month) sheetName = " ".join(("urlaub", year, month)) logging.info("Starting to write excel file {}".format(fileName)) workbook = WB() workbook.iso_dates = True worksheet = workbook.active worksheet.title = sheetName for i in hidden_column: worksheet.column_dimensions[i].hidden = True row = 1 # set width and write header write_sheet_header(worksheet) # process all requests for request in allVacations: row += 1 logging.debug("Write request into row {}: {} {}".format(row, \ request["GUID-R"], request["Name"])) # write the main request data for key in request.keys(): if key == "AbsenceList": continue else: write_cell(worksheet, row, key, request[key]) # then we write the absence data for absence_nr in request["AbsenceList"].keys(): if absence_nr > 1: row += 1 guida = request["AbsenceList"][absence_nr]["GUID-A"] logging.debug("Write absence into row {}: {} {}".format(row, \ absence_nr, guida)) for key in request["AbsenceList"][absence_nr].keys(): value = request["AbsenceList"][absence_nr][key] write_cell(worksheet, row, key, value) logging.info("{} absence request written.".format(row - 1)) try: workbook.save(fileName) logging.info("Excel file successful written") except Exception as error: logging.critical("Failed to write excel file") exit(3) # --- start program --- # xmlfile = "material/urlaubsantraege.xml" ns = "{urn:orkan-software.de:schema:ORKAN}" wantedInfos = { # "Eingetr.": {"Abs": False, "Col": "A", "Width": 12, "Path": ["DateAdd"]}, # "Kurzname": {"Abs": False, "Col": "B", "Width": 15, "Path": ["RequestorMaMatch"]}, "Name": {"Abs": False, "Col": "C", "Width": 25, "Path": ["RequestorFullName"]}, # "Tage": {"Abs": False, "Col": "D", "Width": 8, "Path": ["RequestedHalfDays"]}, #"Typ": {"Abs": True, "Col": "E", "Width": 8, "Path": ["AbsenceType"]}, # "Freigabe1": {"Abs": False, "Col": "E", "Width": 12, "Path": ["Approval1ApprovalStatus"]}, "Freigabe1-Name": {"Abs": False, "Col": "F", "Width": 16, "Path": ["Approval1MaMatch"]}, # "Freigabe2": {"Abs": False, "Col": "G", "Width": 12, "Path": ["Approval2ApprovalStatus"]}, "Freigabe2-Name": {"Abs": False, "Col": "H", "Width": 16, "Path": ["Approval2MaMatch"]}, "Freigabe": {"Abs": True, "Col": "I", "Width": 15, "Path": ["ApprovalStatus"]}, "Art": {"Abs": True, "Col": "J", "Width": 15, "Path": ["AbsenceKindGuid"]}, "Start": {"Abs": True, "Col": "K", "Width": 12, "Path": ["Beginning"]}, "Ende": {"Abs": True, "Col": "L", "Width": 12, "Path": ["Ending"]}, "GUID-A": {"Abs": True, "Col": "M", "Width": 45, "Path": ["VacationRequestGUID"]}, "GUID-R": {"Abs": False, "Col": "N", "Width": 45, "Path": ["GUID"]}, # "Grund": {"Abs": False, "Col": "O", "Width": 40, "Path": ["RejectionReason"]}, # "Kommentar": {"Abs": False, "Col": "P", "Width": 50, "Path": ["RequestorComment"]} } interestedAbsence: dict = { } absenceKindDict = { "{953D3EA0-CB6A-477F-83F1-C1B4939BA0FD}": "Bildungsurlaub", "{DECF84D7-ADD8-4785-AD55-662B8F70C158}": "Lehrgang HWK", "{945EB227-B3E2-4EFF-831E-8D7D1A08281D}": "Resturlaub", "{EFC603E1-5CAF-4153-ADF9-A035440E34CD}": "Sonderurlaub", "{3F43AC39-6132-4397-B236-7D45F9E43019}": "Jahresurlaub" } approvalStatusDict = { '0': "Nicht einger.", '1': "Gelöscht (veraltet)", '2': "Beantragt", '3': "In Bearbeitung", '4': "Abgelehnt", '5': "Gebucht", '6': "Gelöscht" } hidden_column = ("E", "G", "M", "N") loglevel = logging.DEBUG wantedMonth = None wantedYear = None # configure_logging(loglevel) check_doubles() wantedYear, wantedMonth = set_month() allVacations = xml_to_list() write_excel(allVacations)

and the XML file

<VacationRequestsList xmlns="urn:orkan-software.de:schema:ORKAN"> <VacationRequests> <DateAdd>2023-01-22</DateAdd> <RequestorMaMatch>ALPHA</RequestorMaMatch> <ApprovalStatus>5</ApprovalStatus> <FilingDateDate>2023-02-22</FilingDateDate> <RequestedHalfDays>30</RequestedHalfDays> <Approval1RcptTyp>0</Approval1RcptTyp> <Approval1RcptMatch>LAMBDA</Approval1RcptMatch> <Approval1ApprovalStatus>5</Approval1ApprovalStatus> <Approval1TimeStampPutDate>2023-02-23</Approval1TimeStampPutDate> <Approval1MaMatch>LAMBDA</Approval1MaMatch> <Approval2ApprovalStatus>5</Approval2ApprovalStatus> <Approval2TimeStampPutDate>2023-02-23</Approval2TimeStampPutDate> <Approval2MaMatch>KAPPA</Approval2MaMatch> <RequestorFullName>Christian Alpha</RequestorFullName> <Approval1MaFullName>Daniel Lambda</Approval1MaFullName> <Approval2MaFullName>Karsten Kappa</Approval2MaFullName> <AbsenceList> <Absence> <DateAdd>2023-01-22</DateAdd> <MaMatch>ALPHA</MaMatch> <AbsenceType>1</AbsenceType> <AbsenceAccountChange>10</AbsenceAccountChange> <Origin>0</Origin> <Beginning>2023-07-17T00:00:00.00+02:00</Beginning> <BeginningDate>2023-07-17</BeginningDate> <Ending>2023-07-21T23:59:59.99+02:00</Ending> <EndingDate>07:231:07.21</EndingDate> <ApprovalStatus>5</ApprovalStatus> </Absence> <Absence> <DateAdd>2023-01-22</DateAdd> <MaMatch>ALPHA</MaMatch> <AbsenceType>1</AbsenceType> <AbsenceAccountChange>10</AbsenceAccountChange> <Origin>0</Origin> <Beginning>2023-07-24T00:00:00.00+02:00</Beginning> <BeginningDate>2023-07-24</BeginningDate> <Ending>2023-07-28T23:59:59.99+02:00</Ending> <EndingDate>07:231:07.28</EndingDate> <ApprovalStatus>5</ApprovalStatus> </Absence> <Absence> <DateAdd>2023-01-22</DateAdd> <MaMatch>ALPHA</MaMatch> <AbsenceType>1</AbsenceType> <AbsenceAccountChange>10</AbsenceAccountChange> <Origin>0</Origin> <Beginning>2023-07-31T00:00:00.00+02:00</Beginning> <BeginningDate>2023-07-31</BeginningDate> <Ending>2023-08-04T23:59:59.99+02:00</Ending> <EndingDate>07:231:08.04</EndingDate> <ApprovalStatus>5</ApprovalStatus> </Absence> </AbsenceList> </VacationRequests> </VacationRequestsList> ```

Parfait · Accepted Answer · 2023-11-26 23:17:14Z

Consider following tips:

Class: Encapsulate your related def methods for parsing XML and preparing Excel within a Class object to be imported in an end use script that runs full program. This facilitates readability and maintainability even portability. In fact, unit testing becomes easier.
```
class VacationXMLParser(...) ... def init(self, ...): ... def xml_to_list(self): ... class VacationExcelBuilder(...) ... def init(self, ...): ... def write_excel(self): ... 
```
Then calling processes will be better handled:
```
p = VacationXMLParser(xml_file) allVacations = p.xml_to_list() e = VacationExcelBuilder(allVacations) e.write_excel() 
```
Parameters: Parameterize values such as filenames and namespaces within defined method or above proposed class. Do not have underlying methods rely on object declared in global environment. For example, upon first reading ns was an unknown object. Consider passing it in as an argument:
```
def get_full_tag(ns, tag): ''' Adds the namespace to an single xml tag. param 1: string returns: string ''' return "".join((ns, tag)) 
```

If Logic: Avoid the if ... continue logic. Use the counterfactual condition as needed. As example, replace below logic with further below logic:

if request_is_relevant(request) is False: continue else: # if request is relevant call function to grab all needed data. relevant += 1 if request_is_relevant(request): # if request is relevant call function to grab all needed data. relevant += 1

Exit Calls: Avoid explicitly exiting program based on conditions of program execution. Refractor code to cleanly return nothing, raise exceptions, or other robust code flow. With above proposed imported classes you can separate XML and Excel processing and avoid exiting program prematurely.
```
try: tree = ET.parse(xmlfile) logging.debug("Reading xml-file {} successful".format(xmlfile)) root = tree.getroot() except Exception as error: logging.error("Failed to get xml-tree root.") logging.error(error) raise 
```
Try/Except: Related to above, properly release or uninitialize potentially large objects to avoid holding in memory. Use finally block to clear large objects and it will execute before re-raising error (see docs).
```
try: workbook.save(fileName) logging.info("Excel file successful written") except Exception as error: logging.critical("Failed to write excel file") raise finally: workbook = None 
```
Debug Logging: After solid solution in place, avoid the many logging.debug with counter variables in production code which can slow processing and affect performance. Use unit tests on many examples XML files to robustly debug code at each step of process:
```
def test_always_passes(): p = VacationXMLParser(xml_file) ... assert p.request_is_relevant(request) == True def test_always_fails(): p = VacationXMLParser(xml_file) ... assert p.request_is_relevant(request) == False 
```

Pythonic Code: Consider pythonic methods like list/dict comprehensions or conditional expressions:

Specifically, below if/else can be replaced with one line conditional expressions:

if node is None: value = None else: value = node.text value = None if node is None else node.text

Also, building lists can avoid bookkeeping with comprehension:

allVacations = [ process_request(request) for request in root.findall(get_full_tag("VacationRequests")) if request_is_relevant(request) ] ... def get_path(part_list): ... return "/".join(get_full_tag(tag) for tag in part_list)

Even use the latest F-strings for string formatting (introduced in Python 3.6) without the many str.join or long .format calls:

return f"{character.strip()}{str(number).strip()}" ... fileName = os.path.join("material", f"urlaub-{year}-{month}.xls") sheetName = f"urlaub {year} {month}" ... logging.info(f"{len(allVacations)} relevant requests found")

Even replace many if/elif for dictionary:

# any values needs to convert into an readable format values_dict = { "Art": lambda v: absenceKindDict[v], "Tage": lambda v: int(int(v) / 2), "Freigabe1": lambda v: approvalStatusDict[v], "Freigabe2": lambda v: approvalStatusDict[v], "Freigabe": lambda v: approvalStatusDict[v], "Start": lambda v: dt.date.fromisoformat(v.split("T")[0]), "Ende": lambda v: dt.date.fromisoformat(v.split("T")[0]), "Eingetr.": lambda v: dt.date.fromisoformat(v.split("T")[0]) } value = values_dict[key](value)

Maybe you can rewrite above method with conditional expressions returning tuples:

def set_month(): year, month = time.localtime()[0:1] return (year-1, 12) if month == 1 else (year, month-1)

Stack Exchange Network

Python script to read an XML file and write some information to an Excel file

1 Answer 1

Hot Network Questions

Python script to read an XML file and write some information to an Excel file

1 Answer 1

Related

Hot Network Questions