XPS to XLSX

A tool for converting XPS files to XLSX files. Specifically, this maps out the location of data within the XPS and collates it based on X and Y coordinates. This is useful for automating data capture from XPS documents without having to manually copy and paste. Specific areas of the page can be targeted and their content pulled.

# The Basics:
import os
import re
import sys
import time
import datetime
import pandas as pd
import fitz
import xlrd
import xlsxwriter


def _first_available(obj, *names):
    """Return the first attribute of *obj* that exists among *names*.

    Used to call PyMuPDF methods by their current snake_case name while
    still working on older releases that only have the camelCase name.
    """
    for name in names:
        method = getattr(obj, name, None)
        if method is not None:
            return method
    raise AttributeError(
        "%r has none of the attributes: %s" % (obj, ", ".join(names))
    )


def _xps_row(searchterm, path, name):
    """Extract the row of text that sits level with *searchterm*.

    Opens the document at *path*, looks for *searchterm* on the first page
    and collects every non-empty text span whose line starts within 5
    page units (above or below) of the top edge of the first hit.

    Returns ``[name, text, text, ...]`` with the texts ordered left to
    right, or ``None`` when *searchterm* is not found on the first page.
    """
    tolerance = 5  # half-height of the vertical band around the hit

    doc = fitz.open(path)
    try:
        page = _first_available(doc, "load_page", "loadPage")()
        # Only the first hit is used, so no hit limit needs to be passed.
        hits = _first_available(page, "search_for", "searchFor")(searchterm)
        if not hits:
            return None
        anchor_y = round(hits[0].y0, 3)
        content = _first_available(page, "get_text", "getText")("dict")
    finally:
        # Always release the file handle, even if parsing fails.
        doc.close()

    low = anchor_y - tolerance
    high = anchor_y + tolerance

    cells = []
    for block in content["blocks"]:
        # Blocks without a "lines" entry carry no text spans; skip them.
        for line in block.get("lines", ()):
            # Position comes from the line's bounding box, not the span's.
            x0 = round(line["bbox"][0], 3)
            y0 = round(line["bbox"][1], 3)
            if not low < y0 < high:
                continue
            for span in line["spans"]:
                text = span["text"].strip()
                if text:
                    cells.append((x0, text))

    # list.sort is stable: spans sharing a line (and therefore an X0)
    # keep the order in which they appear in the document.
    cells.sort(key=lambda cell: cell[0])
    return [name] + [text for _, text in cells]


def XPStoXLSX(searchterm, directory, r=False):
    """Collect one row of text per XPS file found in *directory*.

    For every ``.xps`` file (extension matched case-insensitively) the
    first page is searched for *searchterm*; the text found on the same
    horizontal band as the first hit becomes one row of the result. Files
    in which the term is not found contribute no row. Each processed file
    name (and each visited sub-folder path) is printed as progress output.

    Note that this function only builds the table; it does not write an
    XLSX file itself.

    Args:
        searchterm: Text to locate on the first page of each document.
        directory: Folder to scan.
        r: When true, sub-folders are scanned recursively.

    Returns:
        A ``pandas.DataFrame`` with integer column labels where column 0
        is the file name without its extension and the remaining columns
        hold the matched texts from left to right (shorter rows are padded
        with NaN). The frame is empty when nothing matched. Returns
        ``False`` when *directory* is not a directory.
    """
    if not os.path.isdir(directory):
        return False

    frames = []
    # Sorted so that row order is reproducible across platforms.
    for item in sorted(os.listdir(directory)):
        path = os.path.join(directory, item)
        if os.path.isdir(path) and r:
            print(path)
            subfolder = XPStoXLSX(searchterm, path, r)
            # False means the folder vanished mid-scan; empty frames add
            # nothing and are skipped.
            if isinstance(subfolder, pd.DataFrame) and not subfolder.empty:
                frames.append(subfolder)
        elif os.path.isfile(path) and item.lower().endswith(".xps"):
            print(item)
            # splitext keeps dotted base names such as "report.v2" intact.
            row = _xps_row(searchterm, path, os.path.splitext(item)[0])
            if row is not None:
                frames.append(pd.DataFrame([row]))

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

Daniel Clarke

Dedicated, Analytic, Professional

Search

Meta