A tool for converting XPS files to XLSX files. Specifically, it maps out the location of data within the XPS page and collates that data by its X and Y coordinates. This is useful for automating data capture from XPS documents without having to copy and paste manually. Specific areas of the page can be targeted and their content pulled out.
# The Basics:
import os
import re
import sys
import time
import datetime
import pandas as pd
import fitz
import xlrd
import xlsxwriter


def _row_texts(page, anchor_y0, tolerance):
    """Return the non-empty span texts on the anchor's row, left to right.

    A span belongs to the row when the rounded top edge (Y0) of the line that
    contains it lies strictly between ``anchor_y0 - tolerance`` and
    ``anchor_y0 + tolerance``.
    """
    low = anchor_y0 - tolerance
    high = anchor_y0 + tolerance
    hits = []
    content = page.getText("dict")
    for block in content["blocks"]:
        # Blocks without a "lines" entry contribute no text.
        for line in block.get("lines", ()):
            # Coordinates come from the *line* bounding box, so every span of
            # one line shares the same X0/Y0.
            x0 = round(line["bbox"][0], 3)
            y0 = round(line["bbox"][1], 3)
            if not low < y0 < high:
                continue
            for span in line["spans"]:
                text = span["text"].strip()
                if text:
                    hits.append((x0, text))
    # list.sort is stable: spans sharing an X0 keep their document order.
    hits.sort(key=lambda hit: hit[0])
    return [text for _, text in hits]


def _extract_row(searchterm, path, tolerance):
    """Return the row texts around the first hit of *searchterm*, or None.

    Only the document's default page (``loadPage()`` with no argument) is
    inspected. The document is always closed before returning.
    """
    # NOTE(review): loadPage / searchFor(hit_max=...) / getText are the legacy
    # camelCase PyMuPDF spellings; newer releases presumably expect
    # load_page / search_for / get_text -- confirm against the installed
    # PyMuPDF version before changing them.
    doc = fitz.open(path)
    try:
        page = doc.loadPage()
        location = page.searchFor(searchterm, hit_max=1)
        if len(location) == 0:
            return None
        anchor_y0 = round(location[0].y0, 3)
        return _row_texts(page, anchor_y0, tolerance)
    finally:
        # The original never closed the document, leaking one handle per file.
        doc.close()


def XPStoXLSX(searchterm, directory, r=False, tolerance=5):
    """Collect, for each XPS file in *directory*, the text row at *searchterm*.

    For every ``.xps`` file the first occurrence of *searchterm* on the
    document's default page is located, and all text spans whose line starts
    within ``tolerance`` of that hit's top edge are gathered and ordered by
    their left edge. Files in which the term is not found are skipped.

    Args:
        searchterm: Text to search for; its position anchors the row.
        directory: Folder to scan for ``.xps`` files.
        r: When true, sub-folders are scanned recursively as well.
        tolerance: Half-height of the vertical window around the hit's Y0,
            in page coordinate units. Defaults to the original fixed value 5.

    Returns:
        A ``pandas.DataFrame`` with one row per matching file: column 0 holds
        the file name up to its first ``"."`` and the remaining columns hold
        the row's texts from left to right. Rows of different widths are
        padded by pandas. Returns ``False`` when *directory* is not an
        existing directory.
    """
    if not os.path.isdir(directory):
        return False

    frames = []
    for item in os.listdir(directory):
        path = os.path.join(directory, item)
        if os.path.isdir(path) and r:
            print(path)
            subfolder = XPStoXLSX(searchterm, path, r, tolerance)
            # Guard against the folder vanishing between the check and the
            # recursive call, and skip empty results so they do not take part
            # in the concatenation.
            if isinstance(subfolder, pd.DataFrame) and not subfolder.empty:
                frames.append(subfolder)
        elif os.path.isfile(path) and item.lower().endswith(".xps"):
            print(item)
            texts = _extract_row(searchterm, path, tolerance)
            if texts is None:
                continue
            # NOTE(review): the name is cut at the FIRST dot, so "a.b.xps"
            # becomes "a" -- kept as in the original; confirm this is wanted.
            name = item.split(".")[0]
            frames.append(pd.DataFrame([[name] + texts]))

    if not frames:
        return pd.DataFrame()
    # DataFrame.append was removed in pandas 2.0; a single concat also avoids
    # re-copying the accumulated frame on every iteration.
    return pd.concat(frames, ignore_index=True)