A tool for converting XPS files to XLSX files. Specifically, this maps out the location of data within the XPS and collates it based on X and Y coordinates. This is useful for automating data capture from XPS documents without having to manually copy and paste. Specific areas of the page can be targeted and their content pulled.
# The Basics:
import os
import re
import sys
import time
import datetime
import pandas as pd
import fitz
import xlrd
import xlsxwriter
def _extract_row(path: str, searchterm: str, tolerance: float = 5):
    """Build the one-row result for a single XPS file.

    The first hit of *searchterm* on the page returned by ``loadPage()``
    (called without a page number) fixes an anchor Y coordinate. Every
    non-empty text span whose *line* bounding box starts strictly within
    ``tolerance`` units of that anchor is kept, and the kept texts are ordered
    left to right by the line's X0.

    Args:
        path: Path of the XPS document to open.
        searchterm: Text whose first hit defines the row to capture.
        tolerance: Half-height of the vertical window around the anchor.

    Returns:
        A single-row ``pandas.DataFrame`` whose first cell is the file name
        without its extension, followed by the captured texts; ``None`` when
        *searchterm* is not found.
    """
    # NOTE(review): loadPage / searchFor(hit_max=...) / getText are the legacy
    # camelCase PyMuPDF names this file was written against; newer PyMuPDF
    # releases use snake_case names -- confirm against the installed version.
    doc = fitz.open(path)
    try:
        page = doc.loadPage()
        hits = page.searchFor(searchterm, hit_max=1)
        if len(hits) == 0:
            return None

        anchor_y = round(hits[0].y0, 3)
        low = anchor_y - tolerance
        high = anchor_y + tolerance

        cells = []
        for block in page.getText("dict")["blocks"]:
            # Blocks without a "lines" entry carry no text spans; skip them.
            for line in block.get("lines", ()):
                # Coordinates come from the line bbox, so every span of a
                # line shares the same X0/Y0 (as in the original code).
                x0 = round(line["bbox"][0], 3)
                y0 = round(line["bbox"][1], 3)
                if not low < y0 < high:
                    continue
                for span in line["spans"]:
                    text = span["text"].strip()
                    if text != "":
                        cells.append((x0, text))
    finally:
        # Always release the document handle, even if extraction fails.
        doc.close()

    # list.sort is stable, so spans sharing an X0 keep their reading order.
    cells.sort(key=lambda cell: cell[0])

    # splitext keeps dots that are part of the name ("a.b.xps" -> "a.b").
    name = os.path.splitext(os.path.basename(path))[0]
    return pd.DataFrame([[name] + [text for _, text in cells]])


def XPStoXLSX(searchterm: str, directory: str, r: bool = False,
              tolerance: float = 5):
    """Collect the row of text aligned with *searchterm* from XPS files.

    Each entry of *directory* whose name ends with the exact suffix ``.xps``
    is processed; with ``r=True`` sub-directories are walked recursively.
    Every file in which *searchterm* is found contributes one row: the file
    name followed by the texts found on the same horizontal band, ordered
    left to right. The path of each visited sub-directory and the name of
    each processed file are printed as progress output.

    Args:
        searchterm: Text used to locate the row of interest on the page.
        directory: Folder to scan.
        r: When true, also descend into sub-directories.
        tolerance: Half-height of the vertical band around the hit
            (defaults to 5, the value previously hard-coded).

    Returns:
        A ``pandas.DataFrame`` with one row per matching file (empty when
        nothing matched), or ``False`` when *directory* is not a directory.
    """
    if not os.path.isdir(directory):
        return False

    # Accumulate frames and concatenate once: DataFrame.append no longer
    # exists in pandas 2.x and was quadratic when called in a loop.
    frames = []
    for item in os.listdir(directory):
        path = os.path.join(directory, item)
        if os.path.isdir(path) and r:
            print(path)
            nested = XPStoXLSX(searchterm, path, r, tolerance)
            # The recursive call yields False if the folder vanished
            # meanwhile; empty frames add nothing to the result.
            if isinstance(nested, pd.DataFrame) and not nested.empty:
                frames.append(nested)
        elif os.path.isfile(path) and item.endswith(".xps"):
            print(item)
            row = _extract_row(path, searchterm, tolerance)
            if row is not None:
                frames.append(row)

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)
