Saroj Dhiman / invoice information extractor
Commit 0fbaffe3 authored Nov 02, 2023 by Saroj Dhiman
final_project
Showing 2 changed files with 88 additions and 0 deletions
information_ext.py
package_used/final_module.py
information_ext.py 0 → 100644 (new file)
from package_used import final_module  # the module ships in this commit as package_used/final_module.py (was "new_module")

questions = [
    "what is the address of company?",
    "what is the date of issue?",
    "What is the invoice number?",
    "What is the billed to address?",
    "what is company?",
]

rawtext_filename = "extracted_text.txt"
image_filename = "output_image.png"
filename = "invoice_4.pdf"

final_module.process_pdf_to_image_and_text(
    filename, 1, questions, rawtext_filename, image_filename
)

print("PDF to image conversion and completion generation complete.")
print("Check your folder.")
#7241
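A single run of this driver should leave three artifacts in the working directory: output_image.png (the rendered page), extracted_text.txt (the raw OCR text), and output_text.txt (the question/answer pairs written by process_pdf_to_image_and_text below). A minimal post-run sanity check, assuming the script was executed from that same directory:

import os

# Hypothetical check: confirm the three files the pipeline writes are present.
for expected in ("output_image.png", "extracted_text.txt", "output_text.txt"):
    status = "found" if os.path.exists(expected) else "missing"
    print(f"{expected}: {status}")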
package_used/final_module.py 0 → 100644 (new file)
from pdf2image import convert_from_path
from PIL import Image
import torch
from transformers import Pix2StructForConditionalGeneration, Pix2StructProcessor
import textract
import os

# Set the device
DEVICE = "cpu"
def generate(model, processor, img, questions):
    inputs = processor(
        images=[img for _ in range(len(questions))],
        text=questions,
        return_tensors="pt",
    ).to(DEVICE)
    predictions = model.generate(**inputs, max_new_tokens=256)
    return zip(questions, processor.batch_decode(predictions, skip_special_tokens=True))
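# Note: the same page image is repeated once per question, so Pix2Struct answers
# every question against that page in a single batch; batch_decode returns the
# answers in the same order as the questions.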
def convert_pdf_to_image(filename, page_no, image_filename):
    images = convert_from_path(filename)
    if page_no < 1 or page_no > len(images):
        raise ValueError(f"Page {page_no} is out of range for the provided PDF.")
    image = images[page_no - 1]
    image.save(image_filename)
    return image
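# Note: pdf2image.convert_from_path relies on the Poppler utilities being installed
# on the system. page_no is 1-based here and mapped to the 0-based index of the
# page list above.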
def extract_text_from_image(image_path):
    # Use Textract to extract text from the image file on disk
    text = textract.process(image_path)
    return text.decode("utf-8")
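# Note: for image files such as .png, textract typically delegates to Tesseract OCR,
# so the tesseract binary also needs to be available.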
def process_pdf_to_image_and_text(filename, page_no, questions, rawtext_filename, image_filename):
    # Load the Pix2Struct model and processor
    model = Pix2StructForConditionalGeneration.from_pretrained(
        "google/pix2struct-docvqa-large"
    ).to(DEVICE)
    processor = Pix2StructProcessor.from_pretrained("google/pix2struct-docvqa-large")

    image = convert_pdf_to_image(filename, page_no, image_filename)
    completions = generate(model, processor, image, questions)

    # Print the completions to the terminal
    answers = []
    for question, answer in completions:
        answers.append(answer)
        print(f"Question: {question}")
        print(f"Answer: {answer}\n")

    # Extract text from the image using Textract
    extracted_text = extract_text_from_image(image_filename)

    # Write the extracted text to a text file
    with open(rawtext_filename, "w") as output_file:
        output_file.write(extracted_text)

    # Write the completions to a text file
    completions_filename = "output_text.txt"
    with open(completions_filename, "w") as output_file:
        for question, answer in zip(questions, answers):
            output_file.write(f"Question: {question}\nAnswer: {answer}\n\n")

    return completions_filename
# filename = "invoice_4.pdf"
# page_no = 1 # Change to the desired page number
# questions = ["what is the name of company?",
# "what is the date of issue?",
# "What is the invoice number?",
# "What is the billed to address?",
# "what is company?"] # Add your questions
# completions_filename = process_pdf_to_image_and_text(filename, page_no, questions, "out.txt", "img.png")
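If the model and processor are already loaded, the module's generate helper can also be reused directly against a previously saved page image, without rerunning the whole pipeline. A minimal sketch, assuming output_image.png exists from an earlier run and that final_module is importable from package_used:

from PIL import Image
from transformers import Pix2StructForConditionalGeneration, Pix2StructProcessor

from package_used import final_module

# Load the same DocVQA checkpoint the module uses and keep it on the CPU.
model = Pix2StructForConditionalGeneration.from_pretrained("google/pix2struct-docvqa-large").to("cpu")
processor = Pix2StructProcessor.from_pretrained("google/pix2struct-docvqa-large")

# Ask a single follow-up question against the already rendered page image.
img = Image.open("output_image.png")
for question, answer in final_module.generate(model, processor, img, ["What is the invoice number?"]):
    print(f"{question} -> {answer}")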