I'm getting an XML file from an application and want to extract some information from it and write it to an Excel file. The XML file contains data about vacations. I need the data from the last month with an approval status of 1, 5 or 6.
The following function returns the month and year:
def set_month(now=None):
    '''
    Returns year and month of the month before "now".

    The clock is read only once, so year and month always belong to the
    same instant (two separate time.localtime() calls could straddle a
    month or year rollover).

    param 1: time.struct_time or None (None means the current local time)
    returns: tuple of integers (year, month)
    '''
    if now is None:
        now = time.localtime()
    year, month = now.tm_year, now.tm_mon
    if month == 1:
        # January: the previous month is December of the previous year
        return (year - 1, 12)
    return (year, month - 1)
I assume there is a better way to do this.
Below you'll find the complete script and an example XML file. I'd be happy about any ideas to improve the code.
#!/usr/bin/env python3
# file: urlaub.py
# desc: Reads an XML file created by another application. The file contains
# vacation requests. This program searches all vacation requests of the
# previous month, grabs the relevant information and writes it into an
# Excel file.
#
# possible exit status if the program does not crash:
# 0: all clear
# 1: a column character is used twice in the wantedInfos dictionary
# 2: failed to read the xml file or to get the tree root
# 3: failed to write the excel file

import time
import logging
from sys import exit
import datetime as dt
from openpyxl import Workbook as WB
from openpyxl.styles import Font
import xml.etree.ElementTree as ET


# --- logging --- #
def configure_logging(loglevel):
    '''
    Configure logging settings.

    param 1: integer (logging level, e.g. logging.DEBUG)
    '''
    formatstring = "%(levelname)-8s %(message)s"
    logging.basicConfig(format=formatstring, level=loglevel)
    logging.info("Loglevel set to %s", loglevel)


# --- misc --- #
def check_doubles():
    '''
    Checks that no column character is used twice in the wantedInfos and
    interestedAbsence dictionaries. Exits with status 1 on a duplicate.
    '''
    seen = set()
    logging.debug("Check for double cell characters")
    for dictionary in (wantedInfos, interestedAbsence):
        for settings in dictionary.values():
            character = settings["Col"]
            if character in seen:
                logging.error("Double cell character found: %s", character)
                exit(1)
            seen.add(character)
    logging.debug("Character list %s", sorted(seen))


# --- xml processing --- #
def set_month(now=None):
    '''
    Determines the month before "now".

    The clock is read only once, so year and month always belong to the
    same instant.

    param 1: time.struct_time or None (None means the current local time)
    returns: tuple of integers (year, month)
    '''
    if now is None:
        now = time.localtime()
    year, month = now.tm_year, now.tm_mon
    if month == 1:
        # January: the previous month is December of the previous year
        year, month = year - 1, 12
    else:
        month -= 1
    logging.info("Search for vacation requests in %s %s", month, year)
    return (year, month)


def _node_text(node):
    '''
    Returns the text of an xml element, or None if the element is missing.

    param 1: xml element or None
    returns: string or None
    '''
    return None if node is None else node.text


def absence_is_relevant(absence):
    '''
    Checks if the given absence is relevant: it has to start in the wanted
    month (wantedYear / wantedMonth) and its approval status has to be one
    of RELEVANT_APPROVAL_STATUS. Absences with a missing or unparsable
    start date or status are treated as not relevant.

    param 1: xml element
    returns: boolean
    '''
    start = _node_text(absence.find(get_path(["Beginning"])))
    logging.debug("Test absence for %s", start)
    try:
        # "2023-07-17T00:00:00.00+02:00" -> date(2023, 7, 17)
        begin = dt.date.fromisoformat((start or "").split("T")[0])
    except ValueError:
        logging.debug("Absence is not relevant")
        return False
    if (begin.year, begin.month) != (wantedYear, wantedMonth):
        logging.debug("Absence is not relevant")
        return False

    logging.debug("Absence is relevant")
    status = _node_text(absence.find(get_path(["ApprovalStatus"])))
    try:
        approved = int(status) in RELEVANT_APPROVAL_STATUS
    except (TypeError, ValueError):
        approved = False
    if not approved:
        logging.debug("Status is %s ... skip.", status)
        return False
    return True


def request_is_relevant(request):
    '''
    Checks if the given request is relevant, i.e. whether at least one
    absence of its absence list is relevant (see absence_is_relevant).

    param 1: xml tree element
    returns: boolean
    '''
    guid_main = _node_text(request.find(get_path(["GUID"])))
    logging.debug("Test for relevanz: %s", guid_main)
    absences = request.findall(get_path(["AbsenceList", "Absence"]))
    if any(absence_is_relevant(absence) for absence in absences):
        logging.debug("Request is relevant")
        return True
    logging.debug("Request is not relevant")
    return False


def get_path(part_list):
    '''
    Builds an ElementTree search path. The data cannot be accessed with the
    bare tag names, so the namespace is added on every tree level.

    param 1: list of strings (tag names)
    returns: string
    '''
    return "/".join(get_full_tag(tag) for tag in part_list)


def get_full_tag(tag):
    '''
    Adds the namespace to a single xml tag.

    param 1: string
    returns: string
    '''
    return "".join((ns, tag))


def crop_namespace(node):
    '''
    Returns the tag name of the node without its "{namespace}" prefix.
    Tags without a namespace are returned unchanged.

    param 1: xml element or None
    returns: string or None
    '''
    if node is None:
        return None
    return node.tag.rpartition("}")[2].strip()


def process_request(request):
    '''
    Collects the wanted data of one request.

    All wantedInfos keys flagged with "Abs" False are read from the request
    itself. The keys flagged with "Abs" True are read from every relevant
    absence of the absence list and stored in a subdictionary under
    "AbsenceList", keyed by a counter of the relevant absences (1, 2, ...).
    Missing xml nodes are stored as None.

    param 1: xml-tree object
    returns: dictionary
    '''
    currentRequest: dict = {'AbsenceList': {}}
    guid_main = _node_text(request.find(get_path(["GUID"])))
    name = _node_text(request.find(get_path(["RequestorFullName"])))
    logging.debug("Processing %s: %s", guid_main, name)

    # collect request level information
    for key, settings in wantedInfos.items():
        if settings['Abs']:
            continue
        path = get_path(settings["Path"])
        logging.debug("Search for key '%s' '%s'", key, path)
        node = request.find(path)
        logging.debug("Found node '%s'", crop_namespace(node))
        value = _node_text(node)
        currentRequest[key] = value
        logging.debug("Store: key: %s, value: '%s'", key, value)

    # walk through absence list and collect absence level information
    relevantCounter = 0
    absences = request.findall(get_path(["AbsenceList", "Absence"]))
    for absenceCounter, absence in enumerate(absences, start=1):
        guid_abs = _node_text(absence.find(get_path(["VacationRequestGUID"])))
        logging.debug("%s. Abs: %s", absenceCounter, guid_abs)
        if not absence_is_relevant(absence):
            logging.debug("Skip absence")
            continue
        logging.debug("Processing absence")
        relevantCounter += 1
        absenceData = {}
        for key, settings in wantedInfos.items():
            if not settings['Abs']:
                continue
            value = _node_text(absence.find(get_path(settings['Path'])))
            absenceData[key] = value
            logging.debug("Store: key: '%s', value: '%s'", key, value)
        currentRequest['AbsenceList'][relevantCounter] = absenceData
    return currentRequest


def xml_to_list():
    '''
    Collects the wanted information from the xml file (xmlfile) and stores
    the data of every relevant request in a list of dictionaries. Exits
    with status 2 if the file cannot be read or parsed.

    returns: list of dictionaries
    '''
    allVacations = []

    # read the xml tree from file and grab tree root
    try:
        tree = ET.parse(xmlfile)
    except (OSError, ET.ParseError) as error:
        logging.error("Failed to get xml-tree root.")
        logging.error(error)
        exit(2)
    logging.debug("Reading xml-file %s successful", xmlfile)
    root = tree.getroot()

    # iterate over all vacation requests and check if the request is
    # relevant.
    requests = root.findall(get_full_tag("VacationRequests"))
    for counter, request in enumerate(requests, start=1):
        logging.debug("* %s. New request found", counter)
        guid = _node_text(request.find(get_path(["GUID"])))
        name = _node_text(request.find(get_path(["RequestorFullName"])))
        logging.debug("%s: %s", guid, name)
        if not request_is_relevant(request):
            continue
        # request is relevant: grab all needed data.
        logging.debug("Relevant entry found: %s", len(allVacations) + 1)
        allVacations.append(process_request(request))
    logging.info("%s relevant requests found", len(allVacations))
    return allVacations


# --- excel --- #
def get_cell_name(character, number):
    '''
    Builds a cell name such as "C7" from column character and row number.

    param 1: string
    param 2: integer
    returns: string
    '''
    return "".join((character.strip(), str(number).strip()))


def write_sheet_header(worksheet):
    '''
    Writes the first row of the worksheet and sets the column widths. The
    cell text is the key from wantedInfos, the column character comes from
    the key's "Col" flag. Characters are used because
    worksheet.column_dimensions is addressed by column letter.

    param 1: worksheet object
    '''
    row = 1
    boldFont = Font(name="Calibri", bold=True)
    for key, settings in wantedInfos.items():
        width = settings["Width"]
        column = settings["Col"]
        worksheet.column_dimensions[column].width = width
        cell = worksheet[get_cell_name(column, row)]
        cell.font = boldFont
        cell.value = key
        logging.debug("Column %s was set to width %s", key, width)
    # freeze_panes freezes the rows above and the columns left of the given
    # cell, so "A2" pins only the header row.
    worksheet.freeze_panes = "A2"
    logging.debug("First row pinned")


def write_cell(worksheet, row, key, value):
    '''
    Writes "value" into a cell. The cell name is built from the wantedInfos
    "Col" flag of "key" and from "row". Some values are converted into a
    readable format first; None (missing xml node) is written unchanged and
    unknown GUIDs / status codes are written as they are.

    param 1: sheet object
    param 2: integer
    param 3: string
    param 4: string or None
    '''
    is_date = key in ("Start", "Ende", "Eingetr.")
    if value is not None:
        if key == "Art":
            value = absenceKindDict.get(value, value)
        elif key == "Tage":
            # the xml stores half days; keep x.5 days instead of truncating
            days = int(value) / 2
            value = int(days) if days.is_integer() else days
        elif key in ("Freigabe1", "Freigabe2", "Freigabe"):
            value = approvalStatusDict.get(value, value)
        elif is_date:
            value = dt.date.fromisoformat(value.split("T")[0])

    # write
    column = wantedInfos[key]['Col']
    cell_name = get_cell_name(column, row)
    cell = worksheet[cell_name]
    cell.value = value
    if is_date:
        cell.number_format = "dd.mm.yyyy"
    logging.debug("%s: '%s' written", cell_name, value)


def write_excel(vacationList):
    '''
    Takes the list of vacation requests and writes their data into an
    excel file named after the wanted year and month. Every relevant
    absence gets its own row; the request level data is written only into
    the first row of a request. Exits with status 3 if saving fails.

    param 1: list of dictionaries (as returned by xml_to_list)
    '''
    year = str(wantedYear)
    month = "{:02d}".format(wantedMonth)
    # openpyxl writes the OOXML format, so the file has to be named .xlsx
    fileName = "material/urlaub-{}-{}.xlsx".format(year, month)
    sheetName = " ".join(("urlaub", year, month))
    logging.info("Starting to write excel file %s", fileName)

    workbook = WB()
    workbook.iso_dates = True
    worksheet = workbook.active
    worksheet.title = sheetName
    for column in hidden_column:
        worksheet.column_dimensions[column].hidden = True

    # set width and write header
    row = 1
    write_sheet_header(worksheet)

    # process all requests
    for request in vacationList:
        row += 1
        logging.debug("Write request into row %s: %s %s", row,
                      request.get("GUID-R"), request.get("Name"))
        # write the main request data
        for key, value in request.items():
            if key != "AbsenceList":
                write_cell(worksheet, row, key, value)
        # then we write the absence data; the first absence shares the row
        # with the request data, every further one gets a new row
        for absence_nr, absence in request["AbsenceList"].items():
            if absence_nr > 1:
                row += 1
            logging.debug("Write absence into row %s: %s %s", row,
                          absence_nr, absence.get("GUID-A"))
            for key, value in absence.items():
                write_cell(worksheet, row, key, value)
    logging.info("%s absence request written.", row - 1)

    try:
        workbook.save(fileName)
    except Exception as error:
        # top-level boundary: report the reason and leave with status 3
        logging.critical("Failed to write excel file")
        logging.critical(error)
        exit(3)
    logging.info("Excel file successful written")


# --- configuration --- #
xmlfile = "material/urlaubsantraege.xml"
ns = "{urn:orkan-software.de:schema:ORKAN}"

# Approval status codes an absence needs to be exported
# (see approvalStatusDict for their meaning).
RELEVANT_APPROVAL_STATUS = (1, 5, 6)

# key: column header; "Abs": True if the value is read from an absence,
# False if it is read from the request; "Col": excel column character;
# "Width": column width; "Path": tag path relative to request / absence.
wantedInfos = {
    "Eingetr.": {"Abs": False, "Col": "A", "Width": 12,
                 "Path": ["DateAdd"]},
    "Kurzname": {"Abs": False, "Col": "B", "Width": 15,
                 "Path": ["RequestorMaMatch"]},
    "Name": {"Abs": False, "Col": "C", "Width": 25,
             "Path": ["RequestorFullName"]},
    "Tage": {"Abs": False, "Col": "D", "Width": 8,
             "Path": ["RequestedHalfDays"]},
    # disabled (would share column E with "Freigabe1"):
    # "Typ": {"Abs": True, "Col": "E", "Width": 8, "Path": ["AbsenceType"]},
    "Freigabe1": {"Abs": False, "Col": "E", "Width": 12,
                  "Path": ["Approval1ApprovalStatus"]},
    "Freigabe1-Name": {"Abs": False, "Col": "F", "Width": 16,
                       "Path": ["Approval1MaMatch"]},
    "Freigabe2": {"Abs": False, "Col": "G", "Width": 12,
                  "Path": ["Approval2ApprovalStatus"]},
    "Freigabe2-Name": {"Abs": False, "Col": "H", "Width": 16,
                       "Path": ["Approval2MaMatch"]},
    "Freigabe": {"Abs": True, "Col": "I", "Width": 15,
                 "Path": ["ApprovalStatus"]},
    "Art": {"Abs": True, "Col": "J", "Width": 15,
            "Path": ["AbsenceKindGuid"]},
    "Start": {"Abs": True, "Col": "K", "Width": 12,
              "Path": ["Beginning"]},
    "Ende": {"Abs": True, "Col": "L", "Width": 12,
             "Path": ["Ending"]},
    "GUID-A": {"Abs": True, "Col": "M", "Width": 45,
               "Path": ["VacationRequestGUID"]},
    "GUID-R": {"Abs": False, "Col": "N", "Width": 45,
               "Path": ["GUID"]},
    "Grund": {"Abs": False, "Col": "O", "Width": 40,
              "Path": ["RejectionReason"]},
    "Kommentar": {"Abs": False, "Col": "P", "Width": 50,
                  "Path": ["RequestorComment"]},
}

interestedAbsence: dict = {
}

absenceKindDict = {
    "{953D3EA0-CB6A-477F-83F1-C1B4939BA0FD}": "Bildungsurlaub",
    "{DECF84D7-ADD8-4785-AD55-662B8F70C158}": "Lehrgang HWK",
    "{945EB227-B3E2-4EFF-831E-8D7D1A08281D}": "Resturlaub",
    "{EFC603E1-5CAF-4153-ADF9-A035440E34CD}": "Sonderurlaub",
    "{3F43AC39-6132-4397-B236-7D45F9E43019}": "Jahresurlaub",
}

approvalStatusDict = {
    '0': "Nicht einger.",
    '1': "Gelöscht (veraltet)",
    '2': "Beantragt",
    '3': "In Bearbeitung",
    '4': "Abgelehnt",
    '5': "Gebucht",
    '6': "Gelöscht",
}

hidden_column = ("E", "G", "M", "N")
loglevel = logging.DEBUG
wantedMonth = None
wantedYear = None


# --- start program --- #
def main():
    '''
    Runs the export: determine the wanted month, read the xml file and
    write the excel file.
    '''
    # absence_is_relevant and write_excel read these module level values
    global wantedYear, wantedMonth
    configure_logging(loglevel)
    check_doubles()
    wantedYear, wantedMonth = set_month()
    allVacations = xml_to_list()
    write_excel(allVacations)


if __name__ == "__main__":
    main()
And here is the example XML file:
<VacationRequestsList xmlns="urn:orkan-software.de:schema:ORKAN"> <VacationRequests> <DateAdd>2023-01-22</DateAdd> <RequestorMaMatch>ALPHA</RequestorMaMatch> <ApprovalStatus>5</ApprovalStatus> <FilingDateDate>2023-02-22</FilingDateDate> <RequestedHalfDays>30</RequestedHalfDays> <Approval1RcptTyp>0</Approval1RcptTyp> <Approval1RcptMatch>LAMBDA</Approval1RcptMatch> <Approval1ApprovalStatus>5</Approval1ApprovalStatus> <Approval1TimeStampPutDate>2023-02-23</Approval1TimeStampPutDate> <Approval1MaMatch>LAMBDA</Approval1MaMatch> <Approval2ApprovalStatus>5</Approval2ApprovalStatus> <Approval2TimeStampPutDate>2023-02-23</Approval2TimeStampPutDate> <Approval2MaMatch>KAPPA</Approval2MaMatch> <RequestorFullName>Christian Alpha</RequestorFullName> <Approval1MaFullName>Daniel Lambda</Approval1MaFullName> <Approval2MaFullName>Karsten Kappa</Approval2MaFullName> <AbsenceList> <Absence> <DateAdd>2023-01-22</DateAdd> <MaMatch>ALPHA</MaMatch> <AbsenceType>1</AbsenceType> <AbsenceAccountChange>10</AbsenceAccountChange> <Origin>0</Origin> <Beginning>2023-07-17T00:00:00.00+02:00</Beginning> <BeginningDate>2023-07-17</BeginningDate> <Ending>2023-07-21T23:59:59.99+02:00</Ending> <EndingDate>07:231:07.21</EndingDate> <ApprovalStatus>5</ApprovalStatus> </Absence> <Absence> <DateAdd>2023-01-22</DateAdd> <MaMatch>ALPHA</MaMatch> <AbsenceType>1</AbsenceType> <AbsenceAccountChange>10</AbsenceAccountChange> <Origin>0</Origin> <Beginning>2023-07-24T00:00:00.00+02:00</Beginning> <BeginningDate>2023-07-24</BeginningDate> <Ending>2023-07-28T23:59:59.99+02:00</Ending> <EndingDate>07:231:07.28</EndingDate> <ApprovalStatus>5</ApprovalStatus> </Absence> <Absence> <DateAdd>2023-01-22</DateAdd> <MaMatch>ALPHA</MaMatch> <AbsenceType>1</AbsenceType> <AbsenceAccountChange>10</AbsenceAccountChange> <Origin>0</Origin> <Beginning>2023-07-31T00:00:00.00+02:00</Beginning> <BeginningDate>2023-07-31</BeginningDate> <Ending>2023-08-04T23:59:59.99+02:00</Ending> <EndingDate>07:231:08.04</EndingDate> 
<ApprovalStatus>5</ApprovalStatus> </Absence> </AbsenceList> </VacationRequests> </VacationRequestsList> ```