from IPython.display import Image
# Render the three overview figures in order, each with width=600.
for figure_path in (
    "images/1_process.png",
    "images/2_get_data.png",
    "images/3_create_corpus.png",
):
    display(Image(filename=figure_path, width=600))
from flair.data import Corpus
from flair.datasets import ColumnCorpus
from flair.embeddings import WordEmbeddings, FlairEmbeddings, StackedEmbeddings, CharacterEmbeddings
from flair.visual.training_curves import Plotter
from flair.visual import Visualizer
from SAPInvoiceExtractor import InvoiceEntityExtractor,InvoiceNERModel
import matplotlib.pyplot as plt
import os
import pandas as pd
import seaborn as sns
# Resolve the corpus folder and the consolidated corpus file inside it to
# absolute paths (relative to the current working directory).
data_folder = os.path.abspath('SAPInvoiceExtractor/Invoice_samples/corpus/custom_corpus')
corpus_file_path = os.path.abspath(
    'SAPInvoiceExtractor/Invoice_samples/corpus/custom_corpus/custom_corpus_consolidated.txt'
)

# Only a train file is passed; the recorded log reports "Dev: None" and
# "Test: None" for this call.
# NOTE(review): dev/test splits are non-empty afterwards in the recorded run,
# so they are presumably sampled from the train data -- confirm in load_corpus.
corpus_data = InvoiceNERModel.load_corpus(
    train_file=corpus_file_path,
    data_folder=data_folder,
)
2022-04-30 12:38:13,921 Reading data from f:\Master\PFM-G8-2022\Code\WebApp\backend\SAPInvoiceExtractor\Invoice_samples\corpus\custom_corpus 2022-04-30 12:38:13,922 Train: f:\Master\PFM-G8-2022\Code\WebApp\backend\SAPInvoiceExtractor\Invoice_samples\corpus\custom_corpus\custom_corpus_consolidated.txt 2022-04-30 12:38:13,922 Dev: None 2022-04-30 12:38:13,923 Test: None
# Every sentence of the corpus, across the train, dev and test splits.
all_sentences = corpus_data.get_all_sentences()

# Token texts returned by the private flair helper.
# NOTE(review): in the recorded run len(all_tokens) was 11437, which equals the
# TRAIN token total reported by obtain_statistics(), so this helper appears to
# cover the train split only. It is kept for any later cell that uses it.
all_tokens = corpus_data._get_all_tokens()

# Count tokens over all sentences so that "Total Tokens" has the same scope as
# "Total de Muestras" (len() of a flair Sentence is its number of tokens).
total_token_count = sum(len(sentence) for sentence in all_sentences)

print(f'Total de Muestras: {len(all_sentences)} - Entrenamiento: {len(corpus_data.train)} / Dev: {len(corpus_data.dev)} / Validación: {len(corpus_data.test)} / Total Tokens: {total_token_count}')
Total de Muestras: 66 - Entrenamiento: 53 / Dev: 6 / Validación: 7 / Total Tokens: 11437
# Print the statistics returned by obtain_statistics(); in the recorded run
# this is a JSON-like report with one section per split (TRAIN, TEST, DEV).
print(corpus_data.obtain_statistics())
{ "TRAIN": { "dataset": "TRAIN", "total_number_of_documents": 53, "number_of_documents_per_class": {}, "number_of_tokens_per_tag": {}, "number_of_tokens": { "total": 11437, "min": 113, "max": 459, "avg": 215.79245283018867 } }, "TEST": { "dataset": "TEST", "total_number_of_documents": 7, "number_of_documents_per_class": {}, "number_of_tokens_per_tag": {}, "number_of_tokens": { "total": 1462, "min": 119, "max": 453, "avg": 208.85714285714286 } }, "DEV": { "dataset": "DEV", "total_number_of_documents": 6, "number_of_documents_per_class": {}, "number_of_tokens_per_tag": {}, "number_of_tokens": { "total": 933, "min": 113, "max": 226, "avg": 155.5 } } }
# Count how often each 'ner' tag occurs over all sentences (private flair helper).
tag_dist = corpus_data._count_token_labels(all_sentences, 'ner')

# Materialise the (tag, count) pairs once and reuse them for the DataFrame
# instead of rebuilding the same list a second time.
tag_list = list(tag_dist.items())
tag_distribution_pd = pd.DataFrame(tag_list, columns=['TAG', 'COUNT'])

# Show the tags from most to least frequent; the DataFrame itself keeps its
# original row order.
print(tag_distribution_pd.sort_values(by=['COUNT'], ascending=False))
TAG COUNT 0 O 11379 6 I-PRODUCT_NAME 1182 5 B-PRODUCT_NAME 178 8 B-PRODUCT_PRICE 164 16 B-PRODUCT_REFERENCE 132 10 B-PRODUCT_TOTAL 132 7 B-PRODUCT_AMOUNT 128 1 B-PROVIDER 78 3 B-DATE 70 14 I-PROVIDER 69 2 B-REFERENCE 66 4 B-REQUEST 65 12 B-TOTAL 65 9 I-PRODUCT_PRICE 37 19 I-PRODUCT_REFERENCE 20 15 I-TOTAL 19 20 I-DATE 18 13 I-REFERENCE 12 11 I-PRODUCT_TOTAL 8 18 I-REQUEST 5 17 I-PRODUCT_AMOUNT 4
# Column views kept under their original names: `x` holds the tag labels and
# `y1` the counts.
y1 = tag_distribution_pd['COUNT']
x = tag_distribution_pd['TAG']

# Horizontal bar chart: counts along the x-axis, one bar per tag on the y-axis.
sns.barplot(data=tag_distribution_pd, x='COUNT', y='TAG', palette="rocket")
<AxesSubplot:xlabel='COUNT', ylabel='TAG'>
# Evaluate the private flair helper for the most common tokens; as a bare
# expression its value is only shown when it is the last statement of a
# notebook cell. The recorded run returned a list of 20 token strings.
# NOTE(review): presumably max_tokens caps the list length and min_freq is the
# minimum occurrence count -- confirm against the flair Corpus implementation.
corpus_data._get_most_common_tokens(max_tokens=20,min_freq=10)
[',', ':', 'de', 'la', '-', ')', '(', '€', 'DE', 'Barcelona', 'y', 'a', 'SL', 'S.L.', 'en', 'con', 'el', '08755', '%', 'que']
# Render images/4_model_training.png inline with width=600.
display(Image(filename="images/4_model_training.png", width=600))
from torchvision import models
from torchinfo import summary
from flair.models import SequenceTagger
# Load the trained tagger from the NERModel folder. The package directory is
# spelled exactly like the imported package name (SAPInvoiceExtractor) so the
# path also resolves on case-sensitive filesystems.
# NOTE(review): other cells spell the sample folder 'Invoice_samples' -- confirm
# the on-disk casing of that directory.
model_loaded = SequenceTagger.load(
    model=os.path.abspath("SAPInvoiceExtractor/invoice_samples/NERModel/final-model.pt")
)

# Print the layer-by-layer parameter summary of the loaded model.
summary(model_loaded)
2022-04-30 12:38:14,818 loading file f:\Master\PFM-G8-2022\Code\WebApp\backend\sapinvoiceextractor\invoice_samples\NERModel\final-model.pt
================================================================= Layer (type:depth-idx) Param # ================================================================= SequenceTagger -- ├─StackedEmbeddings: 1-1 -- │ └─CharacterEmbeddings: 2-1 -- │ │ └─Embedding: 3-1 6,875 │ │ └─LSTM: 3-2 10,400 │ └─FlairEmbeddings: 2-2 -- │ │ └─LanguageModel: 3-3 43,087,046 │ └─FlairEmbeddings: 2-3 -- │ │ └─LanguageModel: 3-4 43,087,046 ├─WordDropout: 1-2 -- ├─LockedDropout: 1-3 -- ├─Linear: 1-4 17,193,462 ├─LSTM: 1-5 9,019,392 ├─Linear: 1-6 12,312 ================================================================= Total params: 112,416,533 Trainable params: 112,416,533 Non-trainable params: 0 =================================================================
# Absolute paths to the loss, test, weights and learning-rate files inside the
# NERModel folder, resolved in a single pass over the relative paths.
(
    loss_file_path,
    test_file_path,
    weights_file_path,
    learning_rate_file_path,
) = map(
    os.path.abspath,
    (
        'SAPInvoiceExtractor/invoice_samples/NERModel/loss.tsv',
        'SAPInvoiceExtractor/invoice_samples/NERModel/test.tsv',
        'SAPInvoiceExtractor/invoice_samples/NERModel/weights.txt',
        'SAPInvoiceExtractor/invoice_samples/NERModel/learning_rate.tsv',
    ),
)
# Plotter instance used for the training-log plots.
plotter = Plotter()

# 4. Initialize the embedding stack: character embeddings plus the
# 'multi-forward' and 'multi-backward' Flair embeddings (no GloVe embeddings
# are part of this stack).
embedding_types = [
    CharacterEmbeddings(),
    FlairEmbeddings('multi-forward'),
    FlairEmbeddings('multi-backward'),
]
embeddings = StackedEmbeddings(embeddings=embedding_types)

visualizer = Visualizer()

# Write the visualisation into the same NERModel folder as the other training
# artefacts; the directory names use the same casing as those paths so the
# target folder also resolves on case-sensitive filesystems.
embeddings_html_path = os.path.abspath('SAPInvoiceExtractor/invoice_samples/NERModel/embeddings.html')

# The method name is spelled as in the original call ("emeddings"), which ran
# successfully in the recorded output. That run took about five minutes for 66
# sentences.
visualizer.visualize_word_emeddings(embeddings, all_sentences, embeddings_html_path)
100%|██████████| 66/66 [04:57<00:00, 4.51s/it] C:\Users\squal\anaconda3\envs\pfm_dev_import_test_3\lib\site-packages\sklearn\manifold\_t_sne.py:780: FutureWarning: The default initialization in TSNE will change from 'random' to 'pca' in 1.2. warnings.warn( C:\Users\squal\anaconda3\envs\pfm_dev_import_test_3\lib\site-packages\sklearn\manifold\_t_sne.py:790: FutureWarning: The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2. warnings.warn(
[t-SNE] Computing 121 nearest neighbors... [t-SNE] Indexed 13832 samples in 0.036s... [t-SNE] Computed neighbors for 13832 samples in 10.071s... [t-SNE] Computed conditional probabilities for sample 1000 / 13832 [t-SNE] Computed conditional probabilities for sample 2000 / 13832 [t-SNE] Computed conditional probabilities for sample 3000 / 13832 [t-SNE] Computed conditional probabilities for sample 4000 / 13832 [t-SNE] Computed conditional probabilities for sample 5000 / 13832 [t-SNE] Computed conditional probabilities for sample 6000 / 13832 [t-SNE] Computed conditional probabilities for sample 7000 / 13832 [t-SNE] Computed conditional probabilities for sample 8000 / 13832 [t-SNE] Computed conditional probabilities for sample 9000 / 13832 [t-SNE] Computed conditional probabilities for sample 10000 / 13832 [t-SNE] Computed conditional probabilities for sample 11000 / 13832 [t-SNE] Computed conditional probabilities for sample 12000 / 13832 [t-SNE] Computed conditional probabilities for sample 13000 / 13832 [t-SNE] Computed conditional probabilities for sample 13832 / 13832 [t-SNE] Mean sigma: 2.437314 [t-SNE] KL divergence after 250 iterations with early exaggeration: 78.365097 [t-SNE] KL divergence after 300 iterations: 2.829407
# Plot the learning-rate log; per the recorded output the figure is saved as
# learning_rate.png next to the input file.
plotter.plot_learning_rate(learning_rate_file_path)

# Load the same tab-separated log into a DataFrame for inspection.
# NOTE(review): the recorded summary shows an Int64Index running 1..1000 rather
# than a default RangeIndex, which suggests pandas consumed the first field of
# each row as the index -- confirm that the header and row field counts match.
learning_rate_data = pd.read_csv(learning_rate_file_path, delimiter='\t')

# DataFrame.info() prints the summary itself and returns None, so it is called
# directly; wrapping it in print() appended a stray "None" to the output.
learning_rate_data.info()
Learning_rate plots are saved in f:\Master\PFM-G8-2022\Code\WebApp\backend\SAPInvoiceExtractor\invoice_samples\NERModel\learning_rate.png
<class 'pandas.core.frame.DataFrame'> Int64Index: 1000 entries, 1 to 1000 Data columns (total 4 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ITERATION 1000 non-null float64 1 TIMESTAMP 1000 non-null float64 2 LEARNING_RATE 1000 non-null float64 3 TRAIN_LOSS 1000 non-null float64 dtypes: float64(4) memory usage: 39.1 KB None
# Plot the training curves from the loss log; per the recorded output the
# "Loss and F1" figure is saved as training.png next to the input file.
plotter.plot_training_curves(loss_file_path)  # test_file_path is defined above but not plotted here
Loss and F1 plots are saved in f:\Master\PFM-G8-2022\Code\WebApp\backend\SAPInvoiceExtractor\invoice_samples\NERModel\training.png
# Plot the weights log; per the recorded output the figure is saved as
# weights.png next to the input file.
plotter.plot_weights(weights_file_path)
Weights plots are saved in f:\Master\PFM-G8-2022\Code\WebApp\backend\SAPInvoiceExtractor\invoice_samples\NERModel\weights.png
# Render images/5_model_inference.png inline with width=600.
display(Image(filename="images/5_model_inference.png", width=600))
# Absolute path of one sample TIF invoice used to demonstrate inference.
sample_invoice_file = os.path.abspath('SAPInvoiceExtractor/Invoice_samples/TIF/01_TIF_SPA_INSERTY_061219.tif')
# Run the project extractor on that file. In the recorded run this printed the
# file name with a language code ("spa"), logged the loading of final-model.pt
# and emitted the invoice text with NER tags attached to recognised tokens.
# NOTE(review): the structure of the returned `budget` object is not visible
# here -- check InvoiceEntityExtractor.process_file before relying on it.
budget = InvoiceEntityExtractor.process_file(sample_invoice_file)
01_TIF_SPA_INSERTY_061219.tif - spa 2022-04-30 12:43:47,929 loading file f:\Master\PFM-G8-2022\Code\WebApp\backend\SAPInvoiceExtractor\Invoice_Samples\NERModel\final-model.pt Inserty <I-PROVIDER> Instal-lacions <I-PROVIDER> , S.L . C / Progrés 7 , Pol . Ind . La Ferrería 08110 Montcada ¡ Reixac T 93 580 28 98 E insertyQinserty.net COMPAÑIA ESPAÑOLA DE LAMINACIÓN SL P.I . SAN VICENTE , C / FERRALLA 12 08755 CASTELLBISBAL BARCELONA NIF : B59559351 4300000583 Codigo de Proveedor : Pupirs CELSAJ . E ———— S / Ref : Pedido : 342128874 <B-REQUEST> — — ORT : 9189 -— 0 -= 1760 19 / 07 / 2019 N Pupitre Apilador4 * - 1,00 . SMT15001C <B-PRODUCT_REFERENCE> APC <B-PRODUCT_NAME> Smart-UPS <I-PRODUCT_NAME> 1500VA <I-PRODUCT_NAME> LCD <I-PRODUCT_NAME> 230V <I-PRODUCT_NAME> 1,00 <B-PRODUCT_AMOUNT> 526,000 <B-PRODUCT_PRICE> 526,000 <B-PRODUCT_TOTAL> 100-582 <B-PRODUCT_REFERENCE> Pasahilos <B-PRODUCT_NAME> horizontal <I-PRODUCT_NAME> 1UA <I-PRODUCT_NAME> 4 <I-PRODUCT_NAME> aros <I-PRODUCT_NAME> metálico <I-PRODUCT_NAME> negro <I-PRODUCT_NAME> 19 <I-PRODUCT_NAME> " <I-PRODUCT_NAME> 2,00 <B-PRODUCT_AMOUNT> 10,710 <B-PRODUCT_PRICE> 21,420 <B-PRODUCT_TOTAL> 542-023-BK <B-PRODUCT_REFERENCE> Bandeja <B-PRODUCT_NAME> fija <I-PRODUCT_NAME> 19 <I-PRODUCT_NAME> " <I-PRODUCT_NAME> F550 <I-PRODUCT_NAME> 2,00 <B-PRODUCT_AMOUNT> 28,030 <B-PRODUCT_PRICE> 56,060 <B-PRODUCT_TOTAL> 1009508-SI <B-PRODUCT_REFERENCE> Regleta <B-PRODUCT_NAME> 19 <I-PRODUCT_NAME> " <I-PRODUCT_NAME> 1UA <I-PRODUCT_NAME> aluminio <I-PRODUCT_NAME> 8 <I-PRODUCT_NAME> schukos <I-PRODUCT_NAME> con <I-PRODUCT_NAME> led <I-PRODUCT_NAME> indicador <I-PRODUCT_NAME> 1,00 <B-PRODUCT_AMOUNT> 29,760 <B-PRODUCT_PRICE> 29,760 <B-PRODUCT_TOTAL> 219585-2 <B-PRODUCT_REFERENCE> Cable <B-PRODUCT_NAME> UTP <I-PRODUCT_NAME> Cat.6 <I-PRODUCT_NAME> 4pares <I-PRODUCT_NAME> LSZH <I-PRODUCT_NAME> 4 <I-PRODUCT_NAME> 305,00 <B-PRODUCT_AMOUNT> 0,460 <B-PRODUCT_PRICE> 140,300 <B-PRODUCT_TOTAL> AMPP24 <B-PRODUCT_REFERENCE> / <I-PRODUCT_REFERENCE> 6UTP 
<B-PRODUCT_REFERENCE> Panel <B-PRODUCT_NAME> 19 <I-PRODUCT_NAME> " <I-PRODUCT_NAME> UTP <I-PRODUCT_NAME> 24 <I-PRODUCT_NAME> puertos <I-PRODUCT_NAME> RJ45 <I-PRODUCT_NAME> Cat.6 <I-PRODUCT_NAME> SL <I-PRODUCT_NAME> completo <I-PRODUCT_NAME> 1,00 <B-PRODUCT_AMOUNT> 96,310 <B-PRODUCT_PRICE> 96,310 <B-PRODUCT_TOTAL> NPCOSUZDB-WT001 <B-PRODUCT_REFERENCE> Latiguillo <B-PRODUCT_NAME> UTP <I-PRODUCT_NAME> Cat <I-PRODUCT_NAME> . <I-PRODUCT_NAME> 6 <I-PRODUCT_NAME> LSZH <I-PRODUCT_NAME> 1 <I-PRODUCT_NAME> mts <I-PRODUCT_NAME> . <I-PRODUCT_NAME> 12,00 <B-PRODUCT_AMOUNT> 3,310 <B-PRODUCT_PRICE> 39,720 <B-PRODUCT_TOTAL> NPCOSUZDB-WT002 <B-PRODUCT_REFERENCE> Latiguillo <B-PRODUCT_NAME> UTP <I-PRODUCT_NAME> Cat <I-PRODUCT_NAME> , <I-PRODUCT_NAME> 6 <I-PRODUCT_NAME> LSZH <I-PRODUCT_NAME> 2 <I-PRODUCT_NAME> mts <I-PRODUCT_NAME> . <I-PRODUCT_NAME> 12,00 <B-PRODUCT_AMOUNT> 3,960 <B-PRODUCT_PRICE> 47,520 <B-PRODUCT_TOTAL> 73071-2 <B-PRODUCT_REFERENCE> Canal <B-PRODUCT_NAME> blanco <I-PRODUCT_NAME> nieve <I-PRODUCT_NAME> 40x60 <I-PRODUCT_NAME> U23X <I-PRODUCT_NAME> 8,00 <B-PRODUCT_AMOUNT> 5,640 <B-PRODUCT_PRICE> 45,120 <B-PRODUCT_TOTAL> Tubo <B-PRODUCT_REFERENCE> flexible <B-PRODUCT_NAME> NYLOFYX <I-PRODUCT_NAME> ECT-36G <I-PRODUCT_NAME> DN36 <I-PRODUCT_NAME> PA <I-PRODUCT_NAME> GRIS <I-PRODUCT_NAME> 60,00 <B-PRODUCT_AMOUNT> 1,980 <B-PRODUCT_PRICE> 118,800 <B-PRODUCT_TOTAL> Pupitre <B-PRODUCT_NAME> Sierras <I-PRODUCT_NAME> 1,00 <B-PRODUCT_AMOUNT> 542-4768-GSBN-BK <B-PRODUCT_REFERENCE> — <B-PRODUCT_NAME> Rack <I-PRODUCT_NAME> ENVIRON <I-PRODUCT_NAME> CR <I-PRODUCT_NAME> 19 <I-PRODUCT_NAME> " <I-PRODUCT_NAME> 47UA <I-PRODUCT_NAME> 600x800 <I-PRODUCT_NAME> color <I-PRODUCT_NAME> negro <I-PRODUCT_NAME> 1,00 <B-PRODUCT_AMOUNT> 942,920 <B-PRODUCT_PRICE> 942,920 <B-PRODUCT_TOTAL> SMT15001C <B-PRODUCT_REFERENCE> APC <B-PRODUCT_NAME> Smart-UPS <I-PRODUCT_NAME> 1500VA <I-PRODUCT_NAME> LCD <I-PRODUCT_NAME> 230V <I-PRODUCT_NAME> 1,00 <B-PRODUCT_AMOUNT> 526,000 <B-PRODUCT_PRICE> “ 
<B-PRODUCT_TOTAL> 526,000 <B-PRODUCT_TOTAL> 100-582 <B-PRODUCT_REFERENCE> Pasahilos <B-PRODUCT_NAME> horizontal <I-PRODUCT_NAME> 1UA <I-PRODUCT_NAME> 4 <I-PRODUCT_NAME> aros <I-PRODUCT_NAME> metálico <I-PRODUCT_NAME> negro <I-PRODUCT_NAME> 19 <I-PRODUCT_NAME> " <I-PRODUCT_NAME> 2,00 <B-PRODUCT_AMOUNT> 10,710 <B-PRODUCT_PRICE> 21,420 <B-PRODUCT_TOTAL> 542-023-BK <B-PRODUCT_REFERENCE> Bandeja <B-PRODUCT_NAME> fija <I-PRODUCT_NAME> 19 <I-PRODUCT_NAME> " <I-PRODUCT_NAME> F550 <I-PRODUCT_NAME> 2,00 <B-PRODUCT_AMOUNT> 28,030 <B-PRODUCT_PRICE> 56,060 <B-PRODUCT_TOTAL> 1009508-SI <B-PRODUCT_REFERENCE> Regleta <B-PRODUCT_NAME> 19 <I-PRODUCT_NAME> " <I-PRODUCT_NAME> 1UA <I-PRODUCT_NAME> aluminio <I-PRODUCT_NAME> 8 <I-PRODUCT_NAME> schukos <I-PRODUCT_NAME> con <I-PRODUCT_NAME> led <I-PRODUCT_NAME> indicador <I-PRODUCT_NAME> 1,00 <B-PRODUCT_AMOUNT> 29,760 <B-PRODUCT_PRICE> 29,760 <B-PRODUCT_TOTAL> 219585-2 <B-PRODUCT_REFERENCE> Cable <B-PRODUCT_NAME> UTP <I-PRODUCT_NAME> Cat.6 <I-PRODUCT_NAME> 4pares <I-PRODUCT_NAME> LSZH <I-PRODUCT_NAME> 4 <I-PRODUCT_NAME> 305,00 <B-PRODUCT_AMOUNT> 0,460 <B-PRODUCT_PRICE> 140,300 <B-PRODUCT_TOTAL> AMPP24 <B-PRODUCT_REFERENCE> / <B-PRODUCT_NAME> 6UTP <I-PRODUCT_NAME> Panel <I-PRODUCT_NAME> 19 <I-PRODUCT_NAME> " <I-PRODUCT_NAME> UTP <I-PRODUCT_NAME> 24 <I-PRODUCT_NAME> puertos <I-PRODUCT_NAME> RJ45 <I-PRODUCT_NAME> Cat.6 <I-PRODUCT_NAME> SL <I-PRODUCT_NAME> completo <I-PRODUCT_NAME> 1,00 <B-PRODUCT_AMOUNT> 96,310 <B-PRODUCT_PRICE> 96,310 <B-PRODUCT_TOTAL> NPCOSUZDB-WT001 <B-PRODUCT_REFERENCE> Latiguillo <B-PRODUCT_NAME> UTP <I-PRODUCT_NAME> Cat <I-PRODUCT_NAME> . <I-PRODUCT_NAME> 6 <I-PRODUCT_NAME> LSZH <I-PRODUCT_NAME> 1 <I-PRODUCT_NAME> mts <I-PRODUCT_NAME> . <I-PRODUCT_NAME> 12,00 <B-PRODUCT_AMOUNT> 3,310 <B-PRODUCT_PRICE> 39,720 <B-PRODUCT_TOTAL> NPCO6UZDB-WT002 <B-PRODUCT_REFERENCE> Latiguillo <B-PRODUCT_NAME> UTP <I-PRODUCT_NAME> Cat <I-PRODUCT_NAME> . 
<I-PRODUCT_NAME> 6 <I-PRODUCT_NAME> LSZH <I-PRODUCT_NAME> 2 <I-PRODUCT_NAME> mts <I-PRODUCT_NAME> . <I-PRODUCT_NAME> 12,00 <B-PRODUCT_AMOUNT> 3,960 <B-PRODUCT_PRICE> 47,520 <B-PRODUCT_TOTAL> 73071-2 <B-PRODUCT_REFERENCE> Canal <B-PRODUCT_NAME> blanco <I-PRODUCT_NAME> nieve <I-PRODUCT_NAME> 40x60 <I-PRODUCT_NAME> U23X <I-PRODUCT_NAME> 6,00 <B-PRODUCT_AMOUNT> 5,640 <B-PRODUCT_PRICE> 33,840 <B-PRODUCT_TOTAL> Tubo <B-PRODUCT_REFERENCE> flexible <B-PRODUCT_NAME> NYLOFYX <I-PRODUCT_NAME> ECT-36G <I-PRODUCT_NAME> DN36 <I-PRODUCT_NAME> PA <I-PRODUCT_NAME> GRIS <I-PRODUCT_NAME> 60,00 <B-PRODUCT_AMOUNT> 1,980 <B-PRODUCT_PRICE> 118,800 <B-PRODUCT_TOTAL> — Suma : - . 9:173,660 Base del Iva 3.173,66 666,47 Cuota Dto / Rec Finan . > - Forma - de Cobro : T90 TRANSFERENCIA A 90 DIAS - | Vtos : 17 / 10 / 2019 - 3.840,13 <B-TOTAL> Eur Domiciliación : ES7901829758320100014388 INSERTY INSTALACIONES , S.L. es Responsable del tratamiento de los datos personales de conformidad con el GDPR , y serán tratados con la finalidad de mantener una relación comercial . Los mismos serán conservados mientras exista un interés mutuo para ello y no se comunicarán _ aterceros salvo obligación legal . Puede ejercer los derechos de acceso , rectificación , portabilidad , supresión , limitación y oposición en C / Progreso , Polígono Industrial La Ferrería , 7 - 08110 Montcada ¡ Reixac ( Barcelona ) . Email : victoria.fernandezQinserty.net , así como el de reclamación a la www.aepd.es Registro Mercantil de Barcelona , Tomo 36246 Gen.Folio 0198 , Hoja 282217 , Incripción 17 CIF B63321954 PRODUCTS - ^(\S+)\s+(.+)\s+((?:\d+[.,])?\d+[.,]\d+)\s+((?:\d+[.,])?\d+[.,]\d+)\s+((?:\d+[.,])?\d+[.,]\d+)$