I've written the following code to extract the data I need from invoices which are in pdf format but can vary in terms of layout and terminology:
(*Step 1:Prepare Training Data*)
trainingDataItemDesc = {
"Description" -> "SimilarString", "DESCRIPTION" -> "SimilarString",
"Product Name" -> "SimilarString",
"Product Details" -> "SimilarString", "Products" -> "SimilarString",
"Product" -> "SimilarString", "Details" -> "SimilarString",
"Desc" -> "SimilarString", "ITEM" -> "SimilarString",
"Product Description" -> "SimilarString", "" -> "SimilarString",
"Item Description" -> "SimilarString", "ITEMS" -> "SimilarString",
"Description of Goods Sold" -> "SimilarString",
"Part Number/Description" -> "SimilarString",
"PRODUCT NAME" -> "SimilarString",
"PRODUCT DETAILS" -> "SimilarString", "PRODUCTS" -> "SimilarString",
"PRODUCT" -> "SimilarString", "DETAILS" -> "SimilarString",
"DESC" -> "SimilarString", "PRODUCT DESCRIPTION" -> "SimilarString",
"ITEM DESCRIPTION" -> "SimilarString",
"DESCRIPTION OF GOODS SOLD" -> "SimilarString",
"PART NUMBER/DESCRIPTION" -> "SimilarString",
"description" -> "SimilarString", "product name" -> "SimilarString",
"product details" -> "SimilarString",
"products" -> "SimilarString", "product" -> "SimilarString",
"details" -> "SimilarString", "desc" -> "SimilarString",
"item" -> "SimilarString", "product description" -> "SimilarString",
"item description" -> "SimilarString", "items" -> "SimilarString",
"description of goods sold" -> "SimilarString",
"part number/description" -> "SimilarString",
"Description:" -> "SimilarString",
"DESCRIPTION:" -> "SimilarString",
"Product Name:" -> "SimilarString",
"Product Details:" -> "SimilarString",
"Products:" -> "SimilarString", "Product:" -> "SimilarString",
"Details:" -> "SimilarString", "Desc:" -> "SimilarString",
"ITEM:" -> "SimilarString",
"Product Description:" -> "SimilarString",
"Item Description:" -> "SimilarString", "ITEMS:" -> "SimilarString",
"Description of Goods Sold:" -> "SimilarString",
"Part Number/Description:" -> "SimilarString",
"Total" -> "NotSimilar", "TOTAL REMITTANCE:" -> "NotSimilar",
"[GBP] TOTAL" -> "NotSimilar", "Total:" -> "NotSimilar",
"Grand Total:" -> "NotSimilar", "Invoice total" -> "NotSimilar",
"Total VAT" -> "NotSimilar", "Total Paid" -> "NotSimilar",
"Amount Due:" -> "NotSimilar", "AMOUNT (USD)" -> "NotSimilar",
"Order Total" -> "NotSimilar", "" -> "NotSimilar",
"Invoice Total:" -> "NotSimilar", "Gross Total" -> "NotSimilar",
"Net Price (per unit)" -> "NotSimilar", "TOTAL" -> "NotSimilar",
"Total paid" -> "NotSimilar", "Invoice Total" -> "NotSimilar",
"TOTAL INC. VAT:" -> "NotSimilar",
"Total Amount Due" -> "NotSimilar", "TOTAL:" -> "NotSimilar",
"GRAND TOTAL:" -> "NotSimilar", "INVOICE TOTAL" -> "NotSimilar",
"TOTAL VAT" -> "NotSimilar", "TOTAL PAID" -> "NotSimilar",
"AMOUNT DUE:" -> "NotSimilar", "ORDER TOTAL" -> "NotSimilar",
"INVOICE TOTAL:" -> "NotSimilar", "GROSS TOTAL" -> "NotSimilar",
"NET PRICE (PER UNIT)" -> "NotSimilar",
"TOTAL AMOUNT DUE" -> "NotSimilar", "total" -> "NotSimilar",
"total remittance:" -> "NotSimilar", "[gbp] total" -> "NotSimilar",
"total:" -> "NotSimilar", "grand total:" -> "NotSimilar",
"invoice total" -> "NotSimilar", "total vat" -> "NotSimilar",
"total paid" -> "NotSimilar", "amount due:" -> "NotSimilar",
"amount (usd)" -> "NotSimilar", "order total" -> "NotSimilar",
"invoice total:" -> "NotSimilar", "gross total" -> "NotSimilar",
"net price (per unit)" -> "NotSimilar",
"total inc. vat:" -> "NotSimilar",
"total amount due" -> "NotSimilar", "[GBP] TOTAL:" -> "NotSimilar",
"Invoice total:" -> "NotSimilar", "Total VAT:" -> "NotSimilar",
"Total Paid:" -> "NotSimilar", "AMOUNT (USD):" -> "NotSimilar",
"Order Total:" -> "NotSimilar", "Gross Total:" -> "NotSimilar",
"Net Price (per unit):" -> "NotSimilar",
"Total paid:" -> "NotSimilar",
"Total Amount Due:" -> "NotSimilar"}; trainingDataSubtot = {
"Subtotal" -> "SimilarString", "Nett Total:" -> "SimilarString",
"COMPONENT TOTAL" -> "SimilarString",
"Sub-Total:" -> "SimilarString", "Sub Total" -> "SimilarString",
"Subtotal:" -> "SimilarString", "" -> "SimilarString",
"Total goods ex VAT" -> "SimilarString",
"Sub-Total" -> "SimilarString",
"Net value of goods" -> "SimilarString",
"Total Net Amount" -> "SimilarString",
"Net Total:" -> "SimilarString", "Sub-total" -> "SimilarString",
"Goods Value" -> "SimilarString", "Products" -> "SimilarString",
"SUB-TOTAL:" -> "SimilarString", "Net Amount" -> "SimilarString",
"Nett Total" -> "SimilarString",
"Total Nett Goods" -> "SimilarString",
"Invoice Total" -> "SimilarString", "SUBTOTAL" -> "SimilarString",
"Net total" -> "SimilarString", "NETT TOTAL:" -> "SimilarString",
"SUB TOTAL" -> "SimilarString", "SUBTOTAL:" -> "SimilarString",
"TOTAL GOODS EX VAT" -> "SimilarString",
"SUB-TOTAL" -> "SimilarString",
"NET VALUE OF GOODS" -> "SimilarString",
"TOTAL NET AMOUNT" -> "SimilarString",
"NET TOTAL:" -> "SimilarString", "GOODS VALUE" -> "SimilarString",
"PRODUCTS" -> "SimilarString", "NET AMOUNT" -> "SimilarString",
"NETT TOTAL" -> "SimilarString",
"TOTAL NETT GOODS" -> "SimilarString",
"INVOICE TOTAL" -> "SimilarString", "NET TOTAL" -> "SimilarString",
"subtotal" -> "SimilarString", "nett total:" -> "SimilarString",
"component total" -> "SimilarString",
"sub-total:" -> "SimilarString", "sub total" -> "SimilarString",
"subtotal:" -> "SimilarString",
"total goods ex vat" -> "SimilarString",
"sub-total" -> "SimilarString",
"net value of goods" -> "SimilarString",
"total net amount" -> "SimilarString",
"net total:" -> "SimilarString", "goods value" -> "SimilarString",
"products" -> "SimilarString", "net amount" -> "SimilarString",
"nett total" -> "SimilarString",
"total nett goods" -> "SimilarString",
"invoice total" -> "SimilarString", "net total" -> "SimilarString",
"COMPONENT TOTAL:" -> "SimilarString",
"Sub Total:" -> "SimilarString",
"Total goods ex VAT:" -> "SimilarString",
"Net value of goods:" -> "SimilarString",
"Total Net Amount:" -> "SimilarString",
"Sub-total:" -> "SimilarString", "Goods Value:" -> "SimilarString",
"Products:" -> "SimilarString", "Net Amount:" -> "SimilarString",
"Total Nett Goods:" -> "SimilarString",
"Invoice Total:" -> "SimilarString", "Net total:" -> "SimilarString",
"Total" -> "NotSimilar", "TOTAL REMITTANCE:" -> "NotSimilar",
"[GBP] TOTAL" -> "NotSimilar", "Total:" -> "NotSimilar",
"Grand Total:" -> "NotSimilar", "Invoice total" -> "NotSimilar",
"Total VAT" -> "NotSimilar", "Total Paid" -> "NotSimilar",
"Amount Due:" -> "NotSimilar", "AMOUNT (USD)" -> "NotSimilar",
"Order Total" -> "NotSimilar", "" -> "NotSimilar",
"Invoice Total:" -> "NotSimilar", "Gross Total" -> "NotSimilar",
"Net Price (per unit)" -> "NotSimilar", "TOTAL" -> "NotSimilar",
"Total paid" -> "NotSimilar", "Invoice Total" -> "NotSimilar",
"TOTAL INC. VAT:" -> "NotSimilar",
"Total Amount Due" -> "NotSimilar", "TOTAL:" -> "NotSimilar",
"GRAND TOTAL:" -> "NotSimilar", "INVOICE TOTAL" -> "NotSimilar",
"TOTAL VAT" -> "NotSimilar", "TOTAL PAID" -> "NotSimilar",
"AMOUNT DUE:" -> "NotSimilar", "ORDER TOTAL" -> "NotSimilar",
"INVOICE TOTAL:" -> "NotSimilar", "GROSS TOTAL" -> "NotSimilar",
"NET PRICE (PER UNIT)" -> "NotSimilar",
"TOTAL AMOUNT DUE" -> "NotSimilar", "total" -> "NotSimilar",
"total remittance:" -> "NotSimilar", "[gbp] total" -> "NotSimilar",
"total:" -> "NotSimilar", "grand total:" -> "NotSimilar",
"invoice total" -> "NotSimilar", "total vat" -> "NotSimilar",
"total paid" -> "NotSimilar", "amount due:" -> "NotSimilar",
"amount (usd)" -> "NotSimilar", "order total" -> "NotSimilar",
"invoice total:" -> "NotSimilar", "gross total" -> "NotSimilar",
"net price (per unit)" -> "NotSimilar",
"total inc. vat:" -> "NotSimilar",
"total amount due" -> "NotSimilar", "[GBP] TOTAL:" -> "NotSimilar",
"Invoice total:" -> "NotSimilar", "Total VAT:" -> "NotSimilar",
"Total Paid:" -> "NotSimilar", "AMOUNT (USD):" -> "NotSimilar",
"Order Total:" -> "NotSimilar", "Gross Total:" -> "NotSimilar",
"Net Price (per unit):" -> "NotSimilar",
"Total paid:" -> "NotSimilar", "Total Amount Due:" -> "NotSimilar"};
trainingDataVAT = {
"Tax" -> "SimilarString", "VAT 20.0 - 20%:" -> "SimilarString",
"TOTAL TAX" -> "SimilarString", "VAT (20%):" -> "SimilarString",
"Tax:" -> "SimilarString", "" -> "SimilarString",
"VAT @ 20.00%" -> "SimilarString", "VAT (20%)" -> "SimilarString",
"Total Vat:" -> "SimilarString", "20 % VAT" -> "SimilarString",
"Total VAT" -> "SimilarString", "VAT:" -> "SimilarString",
"VAT" -> "SimilarString", "VAT Value" -> "SimilarString",
"Total Tax paid" -> "SimilarString",
"Total VAT Amount" -> "SimilarString",
"VAT 20%:" -> "SimilarString", "VAT Amount" -> "SimilarString",
"VAT Total" -> "SimilarString", "Invoice Total" -> "SimilarString",
"VAT TOTAL" -> "SimilarString", "V.A.T" -> "SimilarString",
"TAX" -> "SimilarString", "TAX:" -> "SimilarString",
"TOTAL VAT:" -> "SimilarString", "TOTAL VAT" -> "SimilarString",
"VAT VALUE" -> "SimilarString",
"TOTAL TAX PAID" -> "SimilarString",
"TOTAL VAT AMOUNT" -> "SimilarString",
"VAT AMOUNT" -> "SimilarString",
"INVOICE TOTAL" -> "SimilarString", "tax" -> "SimilarString",
"vat 20.0 - 20%:" -> "SimilarString",
"total tax" -> "SimilarString", "vat (20%):" -> "SimilarString",
"tax:" -> "SimilarString", "vat @ 20.00%" -> "SimilarString",
"vat (20%)" -> "SimilarString", "total vat:" -> "SimilarString",
"20 % vat" -> "SimilarString", "total vat" -> "SimilarString",
"vat:" -> "SimilarString", "vat" -> "SimilarString",
"vat value" -> "SimilarString",
"total tax paid" -> "SimilarString",
"total vat amount" -> "SimilarString",
"vat 20%:" -> "SimilarString", "vat amount" -> "SimilarString",
"vat total" -> "SimilarString", "invoice total" -> "SimilarString",
"v.a.t" -> "SimilarString", "TOTAL TAX:" -> "SimilarString",
"VAT @ 20.00%:" -> "SimilarString", "20 % VAT:" -> "SimilarString",
"Total VAT:" -> "SimilarString", "VAT Value:" -> "SimilarString",
"Total Tax paid:" -> "SimilarString",
"Total VAT Amount:" -> "SimilarString",
"VAT Amount:" -> "SimilarString", "VAT Total:" -> "SimilarString",
"Invoice Total:" -> "SimilarString",
"VAT TOTAL:" -> "SimilarString", "V.A.T:" -> "SimilarString",
"Subtotal" -> "NotSimilar", "Nett Total:" -> "NotSimilar",
"COMPONENT TOTAL" -> "NotSimilar", "Sub-Total:" -> "NotSimilar",
"Sub Total" -> "NotSimilar", "Subtotal:" -> "NotSimilar",
"" -> "NotSimilar", "Total goods ex VAT" -> "NotSimilar",
"Sub-Total" -> "NotSimilar", "Net value of goods" -> "NotSimilar",
"Total Net Amount" -> "NotSimilar", "Net Total:" -> "NotSimilar",
"Sub-total" -> "NotSimilar", "Goods Value" -> "NotSimilar",
"Products" -> "NotSimilar", "SUB-TOTAL:" -> "NotSimilar",
"Net Amount" -> "NotSimilar", "Nett Total" -> "NotSimilar",
"Total Nett Goods" -> "NotSimilar",
"Invoice Total" -> "NotSimilar", "SUBTOTAL" -> "NotSimilar",
"Net total" -> "NotSimilar", "NETT TOTAL:" -> "NotSimilar",
"SUB TOTAL" -> "NotSimilar", "SUBTOTAL:" -> "NotSimilar",
"TOTAL GOODS EX VAT" -> "NotSimilar", "SUB-TOTAL" -> "NotSimilar",
"NET VALUE OF GOODS" -> "NotSimilar",
"TOTAL NET AMOUNT" -> "NotSimilar", "NET TOTAL:" -> "NotSimilar",
"GOODS VALUE" -> "NotSimilar", "PRODUCTS" -> "NotSimilar",
"NET AMOUNT" -> "NotSimilar", "NETT TOTAL" -> "NotSimilar",
"TOTAL NETT GOODS" -> "NotSimilar",
"INVOICE TOTAL" -> "NotSimilar", "NET TOTAL" -> "NotSimilar",
"subtotal" -> "NotSimilar", "nett total:" -> "NotSimilar",
"component total" -> "NotSimilar", "sub-total:" -> "NotSimilar",
"sub total" -> "NotSimilar", "subtotal:" -> "NotSimilar",
"total goods ex vat" -> "NotSimilar", "sub-total" -> "NotSimilar",
"net value of goods" -> "NotSimilar",
"total net amount" -> "NotSimilar", "net total:" -> "NotSimilar",
"goods value" -> "NotSimilar", "products" -> "NotSimilar",
"net amount" -> "NotSimilar", "nett total" -> "NotSimilar",
"total nett goods" -> "NotSimilar",
"invoice total" -> "NotSimilar", "net total" -> "NotSimilar",
"COMPONENT TOTAL:" -> "NotSimilar", "Sub Total:" -> "NotSimilar",
"Total goods ex VAT:" -> "NotSimilar",
"Net value of goods:" -> "NotSimilar",
"Total Net Amount:" -> "NotSimilar", "Sub-total:" -> "NotSimilar",
"Goods Value:" -> "NotSimilar", "Products:" -> "NotSimilar",
"Net Amount:" -> "NotSimilar", "Total Nett Goods:" -> "NotSimilar",
"Invoice Total:" -> "NotSimilar", "Net total:" -> "NotSimilar"};
trainingDataTotal = {
"Total" -> "SimilarString", "TOTAL REMITTANCE:" -> "SimilarString",
"[GBP] TOTAL" -> "SimilarString", "Total:" -> "SimilarString",
"Grand Total:" -> "SimilarString",
"Invoice total" -> "SimilarString", "Total VAT" -> "SimilarString",
"Total Paid" -> "SimilarString", "Amount Due:" -> "SimilarString",
"AMOUNT (USD)" -> "SimilarString",
"Order Total" -> "SimilarString", "" -> "SimilarString",
"Invoice Total:" -> "SimilarString",
"Gross Total" -> "SimilarString",
"Net Price (per unit)" -> "SimilarString",
"TOTAL" -> "SimilarString", "Total paid" -> "SimilarString",
"Invoice Total" -> "SimilarString",
"TOTAL INC. VAT:" -> "SimilarString",
"Total Amount Due" -> "SimilarString", "TOTAL:" -> "SimilarString",
"GRAND TOTAL:" -> "SimilarString",
"INVOICE TOTAL" -> "SimilarString", "TOTAL VAT" -> "SimilarString",
"TOTAL PAID" -> "SimilarString", "AMOUNT DUE:" -> "SimilarString",
"ORDER TOTAL" -> "SimilarString",
"INVOICE TOTAL:" -> "SimilarString",
"GROSS TOTAL" -> "SimilarString",
"NET PRICE (PER UNIT)" -> "SimilarString",
"TOTAL AMOUNT DUE" -> "SimilarString", "total" -> "SimilarString",
"total remittance:" -> "SimilarString",
"[gbp] total" -> "SimilarString", "total:" -> "SimilarString",
"grand total:" -> "SimilarString",
"invoice total" -> "SimilarString", "total vat" -> "SimilarString",
"total paid" -> "SimilarString", "amount due:" -> "SimilarString",
"amount (usd)" -> "SimilarString",
"order total" -> "SimilarString",
"invoice total:" -> "SimilarString",
"gross total" -> "SimilarString",
"net price (per unit)" -> "SimilarString",
"total inc. vat:" -> "SimilarString",
"total amount due" -> "SimilarString",
"[GBP] TOTAL:" -> "SimilarString",
"Invoice total:" -> "SimilarString",
"Total VAT:" -> "SimilarString", "Total Paid:" -> "SimilarString",
"AMOUNT (USD):" -> "SimilarString",
"Order Total:" -> "SimilarString",
"Gross Total:" -> "SimilarString",
"Net Price (per unit):" -> "SimilarString",
"Total paid:" -> "SimilarString",
"Total Amount Due:" -> "SimilarString", "Subtotal" -> "NotSimilar",
"Nett Total:" -> "NotSimilar", "COMPONENT TOTAL" -> "NotSimilar",
"Sub-Total:" -> "NotSimilar", "Sub Total" -> "NotSimilar",
"Subtotal:" -> "NotSimilar", "" -> "NotSimilar",
"Total goods ex VAT" -> "NotSimilar", "Sub-Total" -> "NotSimilar",
"Net value of goods" -> "NotSimilar",
"Total Net Amount" -> "NotSimilar", "Net Total:" -> "NotSimilar",
"Sub-total" -> "NotSimilar", "Goods Value" -> "NotSimilar",
"Products" -> "NotSimilar", "SUB-TOTAL:" -> "NotSimilar",
"Net Amount" -> "NotSimilar", "Nett Total" -> "NotSimilar",
"Total Nett Goods" -> "NotSimilar",
"Invoice Total" -> "NotSimilar", "SUBTOTAL" -> "NotSimilar",
"Net total" -> "NotSimilar", "NETT TOTAL:" -> "NotSimilar",
"SUB TOTAL" -> "NotSimilar", "SUBTOTAL:" -> "NotSimilar",
"TOTAL GOODS EX VAT" -> "NotSimilar", "SUB-TOTAL" -> "NotSimilar",
"NET VALUE OF GOODS" -> "NotSimilar",
"TOTAL NET AMOUNT" -> "NotSimilar", "NET TOTAL:" -> "NotSimilar",
"GOODS VALUE" -> "NotSimilar", "PRODUCTS" -> "NotSimilar",
"NET AMOUNT" -> "NotSimilar", "NETT TOTAL" -> "NotSimilar",
"TOTAL NETT GOODS" -> "NotSimilar",
"INVOICE TOTAL" -> "NotSimilar", "NET TOTAL" -> "NotSimilar",
"subtotal" -> "NotSimilar", "nett total:" -> "NotSimilar",
"component total" -> "NotSimilar", "sub-total:" -> "NotSimilar",
"sub total" -> "NotSimilar", "subtotal:" -> "NotSimilar",
"total goods ex vat" -> "NotSimilar", "sub-total" -> "NotSimilar",
"net value of goods" -> "NotSimilar",
"total net amount" -> "NotSimilar", "net total:" -> "NotSimilar",
"goods value" -> "NotSimilar", "products" -> "NotSimilar",
"net amount" -> "NotSimilar", "nett total" -> "NotSimilar",
"total nett goods" -> "NotSimilar",
"invoice total" -> "NotSimilar", "net total" -> "NotSimilar",
"COMPONENT TOTAL:" -> "NotSimilar", "Sub Total:" -> "NotSimilar",
"Total goods ex VAT:" -> "NotSimilar",
"Net value of goods:" -> "NotSimilar",
"Total Net Amount:" -> "NotSimilar", "Sub-total:" -> "NotSimilar",
"Goods Value:" -> "NotSimilar", "Products:" -> "NotSimilar",
"Net Amount:" -> "NotSimilar", "Total Nett Goods:" -> "NotSimilar",
"Invoice Total:" -> "NotSimilar", "Net total:" -> "NotSimilar"};
(*Step 2:Train a Machine Learning Classifier*)
modelItemDesc = Classify[trainingDataItemDesc];
modelSubtot = Classify[trainingDataSubtot];
modelShip = Classify[trainingDataShip];
modelVAT = Classify[trainingDataVAT];
modelTotal = Classify[trainingDataTotal];
(*testPhrase="Another Random Example";
Print["Test phrase classification: ",modelSubtot[testPhrase]];*)
(*Step 1:Set Up the Word COM Interface*)
extractTextFromPDFUsingPDFtoText[pdfPath_String] :=
Module[{outputPath, result},
outputPath =
FileNameJoin[{DirectoryName[pdfPath],
FileBaseName[pdfPath] <> ".txt"}];
RunProcess[{"C:\\Poppler\\Library\\bin\\pdftotext.exe", "-layout",
pdfPath, outputPath}];
result = Import[outputPath, "Text"];
If[result === "", extractTextWithOCR[pdfPath], result]]
(*Step 2:Determine If OCR Is Required*)
requiresOCR[text_String] :=
StringLength[text] < 50 ||
StringMatchQ[text, RegularExpression["[^a-zA-Z0-9]+"],
IgnoreCase -> True]
(*Step 3:OCR Extraction Function*)
extractTextWithOCR[filename_String] := Module[{pages, ocrText},
(*Try to import the PDF as individual images*)
pages = Import[filename, {"PDF", "Pages"}];
(*If import fails,return a failure message*)
If[pages === $Failed, Return["$Failed"]];
(*Apply OCR to each page*)ocrText = TextRecognize /@ pages;
(*Combine the recognized text from all pages*)StringJoin[ocrText]]
(*Step 4:Combined Text Extraction Function*)
extractTextFromPDF[filename_String] :=
Module[{text},(*First,try to extract text using Microsoft Word*)
Print[filename];
text = Quiet[Check[extractTextFromPDFUsingPDFtoText[filename], ""]];
(*If the extracted text is empty or looks like gibberish,apply OCR*)
If[requiresOCR[text], Print["Requires OCR"];
text = extractTextWithOCR[filename]];
text]
(*Step 5:Search through all subfolders in a folder and extract \
all.pdf filenames*)
getAllPDFs[folder_String] := FileNames["*.pdf", folder, Infinity];
(*Specify the training and main folders where the PDFs are located*)
mainFolder =
"Z:\\Shared\\03 - Operations\\06 - Supply Chain\\Invoice Data \
Extraction\\Test";
Print["Getting a list of all PDF files in the folder and its \
subfolders..."];
(*Get a list of all PDF files in the folder and its subfolders*)
pdfFiles = getAllPDFs[mainFolder];
(*Step 5:Initialize a list to store the first entries in \
extractedNumbersSubtot for each file*)
firstEntriesItemDesc = {};
firstEntriesSubtot = {};
firstEntriesShip = {};
firstEntriesVAT = {};
firstEntriesTotal = {};
(*Step 6:Process each file in the folder*)
Do[
(*Load the file content*)
invoiceText = extractTextFromPDF[pdfFiles[[i]]];
(*Step 4:
Find positionsSubtot of phrases in the file content that match \
similar strings*)
allPositionsItemDesc = {};
allPositionsSubtot = {};
allPositionsShip = {};
allPositionsVAT = {};
allPositionsTotal = {};
Do[
(*Find positionsItemDesc of the current phrase in the file content*)
positionsItemDesc = StringPosition[invoiceText, phrase];
(*Classify the phrase if it's found in the text*)
If[Length[positionsItemDesc] > 0 &&
modelItemDesc[phrase] === "SimilarString",
(*Add all positionsItemDesc of this phrase to the list*)
allPositionsItemDesc =
Join[allPositionsItemDesc, positionsItemDesc]
],
{phrase, Keys[trainingDataItemDesc]}
];
Print[allPositionsItemDesc];
Do[
(*Find positionsSubtot of the current phrase in the file content*)
positionsSubtot = StringPosition[invoiceText, phrase];
(*Classify the phrase if it's found in the text*)
If[Length[positionsSubtot] > 0 &&
modelSubtot[phrase] === "SimilarString",
(*Add all positionsSubtot of this phrase to the list*)
allPositionsSubtot = Join[allPositionsSubtot, positionsSubtot]
],
{phrase, Keys[trainingDataSubtot]}
];
Do[
(*Find positionsSubtot of the current phrase in the file content*)
positionsShip = StringPosition[invoiceText, phrase];
(*Classify the phrase if it's found in the text*)
If[Length[positionsShip] > 0 &&
modelShip[phrase] === "SimilarString",
(*Add all positionsSubtot of this phrase to the list*)
allPositionsShip = Join[allPositionsShip, positionsShip]
],
{phrase, Keys[trainingDataShip]}
];
Do[
(*Find positionsVAT of the current phrase in the file content*)
positionsVAT = StringPosition[invoiceText, phrase];
(*Classify the phrase if it's found in the text*)
If[Length[positionsVAT] > 0 && modelVAT[phrase] === "SimilarString",
(*Add all positionsVAT of this phrase to the list*)
allPositionsVAT = Join[allPositionsVAT, positionsVAT]
],
{phrase, Keys[trainingDataVAT]}
];
Do[
(*Find positionsSubtot of the current phrase in the file content*)
positionsTotal = StringPosition[invoiceText, phrase];
(*Classify the phrase if it's found in the text*)
If[Length[positionsTotal] > 0 &&
modelTotal[phrase] === "SimilarString",
(*Add all positionsSubtot of this phrase to the list*)
allPositionsTotal = Join[allPositionsTotal, positionsTotal]
],
{phrase, Keys[trainingDataTotal]}
];
(*Step 4:Extract Numbers Next to Similar Strings*)
allItemDescriptions = {};
itemDescriptions = {};
extractedNumbersSubtot = {};
extractedNumbersShip = {};
extractedNumbersVAT = {};
extractedNumbersTotal = {};
numberPattern = RegularExpression["\\d{1,3}(,\\d{3})*(\\.\\d+)?"];
Do[
(*Print["positionsItemDesc dims = ",Dimensions[
positionsItemDesc]];*)
If[
Dimensions[positionsItemDesc] =!= {0},
pos =
Flatten[positionsItemDesc][[2]];(*Get the end positionsItemDesc \
of the match*)
(*Extract lines of text directly below the item description \
header*)
itemLines = StringSplit[StringDrop[invoiceText, pos], "\n"];
Print["itemLines Dims = ", Dimensions[itemLines]];
Print["Length[itemLines] = ", Length[itemLines]];
Print["Min[Length[itemLines]] = ", Min[Length[itemLines]]];
(*Ensure we handle empty lines*)
itemIndex = 1;
qtyIndex = 1;
(*Match the lines that are under the headers and collect them*)
Do[
If[StringLength[itemLines[[i]]] > 0,
Print["i = ", i];
itemLine = StringTrim[itemLines[[i]]];
Print["itemLine = ", itemLine];
AppendTo[itemDescriptions, itemLine];
],
{i, 1, Min[Length[itemLines]]}]
],
{positionsItemDesc, allPositionsItemDesc}
];
Do[
(*Print["positionSubtot dims = ",Dimensions[positionSubtot]];*)
If[
Dimensions[positionSubtot] =!= {0},
pos =
Flatten[positionSubtot][[2]];(*Get the end positionSubtot of the \
match*)
(*Extract the text after the similar string's positionSubtot*)
afterText = StringDrop[invoiceText, pos];
(*Find the first number following the similar string*)
number = StringCases[afterText, numberPattern, 1];
(*If a number is found,add it to the list of extracted numbers*)
If[number =!= {},
AppendTo[extractedNumbersSubtot, number[[1]]]
(*Print["No number found after positionSubtot: ",pos]*)
];
],
{positionSubtot, allPositionsSubtot}
];
Do[
If[
Dimensions[positionShip] =!= {0},
pos =
Flatten[positionShip][[2]];(*Get the end positionShip of the \
match*)
(*Extract the text after the similar string's positionShip*)
afterText = StringDrop[invoiceText, pos];
(*Find the first number following the similar string*)
number = StringCases[afterText, numberPattern, 1];
(*If a number is found,add it to the list of extracted numbers*)
If[number =!= {},
AppendTo[extractedNumbersShip, number[[1]]]
(*Print["No number found after positionShip: ",pos]*)
];
],
{positionShip, allPositionsShip}
];
Do[
If[
Dimensions[positionVAT] =!= {0},
pos =
Flatten[positionVAT][[2]];(*Get the end positionVAT of the match*)
\
(*Extract the text after the similar string's positionVAT*)
afterText = StringDrop[invoiceText, pos];
(*Find the first number following the similar string*)
number = StringCases[afterText, numberPattern, 1];
(*If a number is found,add it to the list of extracted numbers*)
If[number =!= {},
AppendTo[extractedNumbersVAT, number[[1]]]
(*Print["No number found after positionVAT: ",pos]*)
];
],
{positionVAT, allPositionsVAT}
];
Do[
If[
Dimensions[positionTotal] =!= {0},
pos =
Flatten[positionTotal][[2]];(*Get the end positionTotal of the \
match*)
(*Extract the text after the similar string's positionTotal*)
afterText = StringDrop[invoiceText, pos];
(*Find the first number following the similar string*)
number = StringCases[afterText, numberPattern, 1];
(*If a number is found,add it to the list of extracted numbers*)
If[number =!= {},
AppendTo[extractedNumbersTotal, number[[1]]]
(*Print["No number found after positionSubtot: ",pos]*)
];
],
{positionTotal, allPositionsTotal}
];
(*Step 6:
Flatten the list to handle any nested structure and get the first \
number*)
firstExtractedNumberSubtot =
If[Length[extractedNumbersSubtot] > 0,
Flatten[extractedNumbersSubtot][[1]], "No numbers found"];
firstExtractedNumberShip =
If[Length[extractedNumbersShip] > 0,
Flatten[extractedNumbersShip][[1]], "No numbers found"];
firstExtractedNumberVAT =
If[Length[extractedNumbersVAT] > 0,
Flatten[extractedNumbersVAT][[1]], "No numbers found"];
firstExtractedNumberTotal =
If[Length[extractedNumbersTotal] > 0,
Flatten[extractedNumbersTotal][[1]], "No numbers found"];
(*Step 10:Add the result to the firstEntriesSubtot list*)
AppendTo[allItemDescriptions, Flatten[itemDescriptions]];
AppendTo[firstEntriesSubtot, firstExtractedNumberSubtot];
AppendTo[firstEntriesShip, firstExtractedNumberShip];
AppendTo[firstEntriesVAT, firstExtractedNumberVAT];
AppendTo[firstEntriesTotal, firstExtractedNumberTotal],
{i, Length[pdfFiles]}
];
(*Final output:List of first extracted numbers for each file*)
Print["All item descriptions for each file: ", allItemDescriptions];
Print["First entries in extractedNumbersSubtot for each file: ",
firstEntriesSubtot];
Print["First entries in extractedNumbersShip for each file: ",
firstEntriesShip];
Print["First entries in extractedNumbersShip for each file: ",
firstEntriesVAT];
Print["First entries in extractedNumbersTotal for each file: ",
firstEntriesTotal];
The code is working well for the search strings which have the desired corresponding data to their right (trainingDataShip, trainingDataVAT, trainingDataTotal). The problems arise for the tabulated items where the desired data are below the search string headers (trainingDataItemDesc). After the invoices are converted to text format, the tables are lost so it is then not possible to extract all the data in the table cells below each header.
How to go about this?