Skip to content
Toggle navigation
Projects
Groups
Snippets
Help
Saroj Dhiman
/
invoice information extractor
This project
Loading...
Sign in
Toggle navigation
Go to a project
Project
Repository
Issues
0
Merge Requests
0
Pipelines
Wiki
Snippets
Settings
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Commit 160d8792
authored
Nov 07, 2023
by
Saroj Dhiman
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
updated module
1 parent
a21b3715
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
76 additions
and
0 deletions
updated_model.py
updated_model.py
0 → 100644
View file @
160d879
from
pdf2image
import
convert_from_path
from
PIL
import
Image
import
torch
from
transformers
import
Pix2StructForConditionalGeneration
,
Pix2StructProcessor
import
textract
import
os
import
time
# CUDA debugging switches: CUDA_LAUNCH_BLOCKING=1 makes kernel launches
# synchronous so an error is reported at the call that triggered it.
# NOTE(review): TORCH_USE_CUDA_DSA is usually described as a build-time option;
# confirm that setting it in the environment at runtime has any effect.
os.environ["TORCH_USE_CUDA_DSA"] = "true"
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

# Set the device: use the GPU when one is available, otherwise fall back to
# the CPU so the script still runs on machines without CUDA.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Load the Pix2Struct model and processor from the same checkpoint.
MODEL_NAME = "google/pix2struct-docvqa-large"
model = Pix2StructForConditionalGeneration.from_pretrained(MODEL_NAME).to(DEVICE)
processor = Pix2StructProcessor.from_pretrained(MODEL_NAME)
def generate(img, questions):
    """Answer every question about one page image with the Pix2Struct model.

    The same image is paired with each question so that all questions are
    processed as a single batch.

    Args:
        img: The page image handed to the processor.
        questions: Sized sequence of question strings.

    Returns:
        A one-shot iterator of ``(question, answer)`` pairs, in the order of
        ``questions``. It is empty when ``questions`` is empty.
    """
    if len(questions) == 0:
        # Nothing to ask: avoid handing an empty batch to the processor/model.
        return zip((), ())

    inputs = processor(
        images=[img] * len(questions),
        text=questions,
        return_tensors="pt",
    ).to(DEVICE)
    predictions = model.generate(**inputs, max_new_tokens=256)
    answers = processor.batch_decode(predictions, skip_special_tokens=True)
    return zip(questions, answers)
def convert_pdf_to_images(filename):
    """Return whatever pdf2image's ``convert_from_path`` produces for ``filename``."""
    return convert_from_path(filename)
def extract_text_from_image(image):
    """Extract text with textract and return it as a UTF-8 decoded string.

    ``image`` is forwarded unchanged to ``textract.process``; the caller in
    this file passes the path of a saved PNG page.
    """
    # textract returns bytes, so decode before handing the text back.
    raw_bytes = textract.process(image)
    return raw_bytes.decode("utf-8")
def process_pdf_to_image_and_text(
    filename, questions, rawtext_filename, image_filename, completions_filename
):
    """Run question answering and text extraction on every page of a PDF.

    For each page (numbered from 1) this saves the page as a PNG, asks the
    model all ``questions`` about it, prints the answers, writes the text
    extracted from the saved PNG to a per-page file, and appends the
    question/answer pairs to ``completions_filename``.

    Args:
        filename: Path of the PDF to process.
        questions: Sequence of question strings asked about every page.
        rawtext_filename: Prefix for the per-page text files; the output is
            ``{rawtext_filename}_page_{n}.txt``.
        image_filename: Prefix for the per-page images; the output is
            ``{image_filename}_page_{n}.png``.
        completions_filename: File the question/answer pairs are appended to.
    """
    pages = convert_pdf_to_images(filename)
    for page_no, image in enumerate(pages, 1):
        # Build the page image path once; it is needed for saving and for
        # the text extraction below.
        image_path = f"{image_filename}_page_{page_no}.png"
        image.save(image_path)

        # generate() returns a one-shot iterator; materialise it because the
        # pairs are consumed twice (terminal output and the completions file).
        completions = list(generate(image, questions))

        # Print the completions to the terminal
        for question, answer in completions:
            print(f"Question: {question}")
            print(f"Answer: {answer}\n")

        # Extract text from the saved page image using textract
        extracted_text = extract_text_from_image(image_path)

        # Write the extracted text to a text file. The encoding is explicit so
        # non-ASCII text does not depend on the platform's locale default.
        with open(
            f"{rawtext_filename}_page_{page_no}.txt", "w", encoding="utf-8"
        ) as output_file:
            output_file.write(extracted_text)

        # Append the completions to the shared text file
        with open(completions_filename, "a", encoding="utf-8") as output_file:
            for question, answer in completions:
                output_file.write(
                    f"Page {page_no}\nQuestion: {question}\nAnswer: {answer}\n\n"
                )
# PDF document to process.
filename = "invoice_123.pdf"

# Questions asked about every page; add your own here.
questions = [
    "what is the name of company?",
    "what is the date of issue?",
    "What is the invoice number?",
    "What is the billed to address?",
    "what is company?",
]

# Text file that collects the question/answer pairs.
completions_filename = "out.txt"

# Time the whole run and report the duration in minutes.
start = time.time()
process_pdf_to_image_and_text(
    filename,
    questions,
    rawtext_filename="raw.txt",
    image_filename="img.png",
    completions_filename=completions_filename,
)
elapsed_minutes = (time.time() - start) / 60
print("time taken ", elapsed_minutes)
Write
Preview
Markdown
is supported
Attach a file
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to post a comment