Skip to content
Toggle navigation
Projects
Groups
Snippets
Help
Saroj Dhiman
/
invoice information extractor
This project
Loading...
Sign in
Toggle navigation
Go to a project
Project
Repository
Issues
0
Merge Requests
0
Pipelines
Wiki
Snippets
Settings
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Commit 45a79020
authored
Nov 06, 2023
by
Saroj Dhiman
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
This is the updated module, in which I made some changes using the pytesseract library.
1 parent
843f4f6d
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
151 additions
and
0 deletions
invoice_to_info_module.py
invoice_to_info_module.py
0 → 100644
View file @
45a7902
from
pdf2image
import
convert_from_path
from
PIL
import
Image
import
torch
from
transformers
import
Pix2StructForConditionalGeneration
,
Pix2StructProcessor
import
textract
import
os
# Module-level configuration, applied once at import time.
# Both CUDA-related environment flags are set in a single call; the values
# are the same strings the module has always exported.
# NOTE(review): presumably these are debugging aids for CUDA errors -- confirm
# they are still wanted outside of debugging sessions.
os.environ.update(
    {
        "TORCH_USE_CUDA_DSA": "true",
        "CUDA_LAUNCH_BLOCKING": "1",
    }
)

# Torch device string that both the model and its inputs are moved to.
# Alternatives tried previously: "cuda" and "cpu".
DEVICE = "cuda:1"
def generate(model, processor, img, questions):
    """Answer every question about a single document image.

    Args:
        model: Object exposing ``generate(**inputs, max_new_tokens=...)``.
            It is expected to live on ``DEVICE`` already, because the inputs
            are moved there before generation.
        processor: Callable that builds model inputs from ``images`` and
            ``text`` and that exposes ``batch_decode``.
        img: The page image; the same image object is paired with every
            question.
        questions: One question string, or any iterable of question strings.

    Returns:
        A one-shot ``zip`` iterator of ``(question, answer)`` pairs in the
        order the questions were supplied. The iterator is empty when no
        questions were supplied.
    """
    # A bare string would otherwise be treated as a sequence of characters
    # (one image per character, answers zipped against single letters).
    if isinstance(questions, str):
        questions = [questions]
    else:
        # Materialise so that len() works and generators can be passed in.
        questions = list(questions)

    # Nothing to ask: skip the processor/model round trip entirely.
    if not questions:
        return zip((), ())

    inputs = processor(
        images=[img] * len(questions),
        text=questions,
        return_tensors="pt",
    ).to(DEVICE)
    predictions = model.generate(**inputs, max_new_tokens=256)
    answers = processor.batch_decode(predictions, skip_special_tokens=True)
    return zip(questions, answers)
def convert_pdf_to_image(filename, page_no):
    """Render one page of a PDF file as an image.

    Only the requested page is rasterised, instead of converting the whole
    document and discarding every other page.

    Args:
        filename: Path of the PDF file to read.
        page_no: 1-based number of the page to render.

    Returns:
        The image produced by ``pdf2image.convert_from_path`` for that page.

    Raises:
        ValueError: If ``page_no`` is smaller than 1 or larger than the
            number of pages in the PDF.
    """
    out_of_range = f"Page {page_no} is out of range for the provided PDF."

    # Reject impossible page numbers before touching the file at all.
    if page_no < 1:
        raise ValueError(out_of_range)

    # Restrict rendering to the single page we need.
    pages = convert_from_path(filename, first_page=page_no, last_page=page_no)

    # An empty result means the requested page lies past the end of the PDF.
    if not pages:
        raise ValueError(out_of_range)
    return pages[0]
def extract_text_from_image(image):
    """Run textract on an image and return the recognised text as ``str``.

    Args:
        image: Passed straight to ``textract.process``; the caller in this
            module supplies the path of a saved image file.

    Returns:
        The output of ``textract.process`` decoded as UTF-8.
    """
    raw_output = textract.process(image)
    decoded_text = raw_output.decode("utf-8")
    return decoded_text
def process_pdf_to_image_and_text(
    filename,
    page_no,
    questions,
    rawtext_filename,
    image_filename,
    completions_filename="output_text.txt",
):
    """Render a PDF page, answer questions about it and dump its raw text.

    Steps, in order:
      1. Render page ``page_no`` of ``filename`` and save it to
         ``image_filename``.
      2. Load the ``google/pix2struct-docvqa-large`` model and processor and
         answer every entry of ``questions`` about the page image.
      3. Print each question/answer pair to stdout.
      4. Extract the text of the saved image and write it to
         ``rawtext_filename``.
      5. Write the question/answer pairs to ``completions_filename``.

    Args:
        filename: Path of the source PDF.
        page_no: 1-based page number to process.
        questions: Questions to ask about the page.
        rawtext_filename: Output path for the extracted raw text.
        image_filename: Output path for the rendered page image.
        completions_filename: Output path for the question/answer pairs.
            Defaults to ``"output_text.txt"``, the path that used to be
            hard-coded.

    Returns:
        ``completions_filename``, the path the answers were written to.

    Raises:
        ValueError: Propagated from ``convert_pdf_to_image`` when ``page_no``
            is out of range.
    """
    # Render the page first: a bad page number should fail before the
    # expensive model download/load happens.
    image = convert_pdf_to_image(filename, page_no)
    image.save(image_filename)

    # Load the Pix2Struct model and processor.
    model = Pix2StructForConditionalGeneration.from_pretrained(
        "google/pix2struct-docvqa-large"
    ).to(DEVICE)
    processor = Pix2StructProcessor.from_pretrained("google/pix2struct-docvqa-large")

    # generate() returns a one-shot iterator; keep the pairs so they can be
    # both printed and written to disk.
    completions = list(generate(model, processor, image, questions))

    # Print the completions to the terminal.
    for question, answer in completions:
        print(f"Question: {question}")
        print(f"Answer: {answer}\n")

    # Extract text from the saved image file.
    extracted_text = extract_text_from_image(image_filename)

    # Explicit UTF-8 so non-ASCII OCR output does not depend on the locale's
    # default encoding.
    with open(rawtext_filename, "w", encoding="utf-8") as output_file:
        output_file.write(extracted_text)

    # Write the completions to a text file.
    with open(completions_filename, "w", encoding="utf-8") as output_file:
        output_file.writelines(
            f"Question: {question}\nAnswer: {answer}\n\n"
            for question, answer in completions
        )

    return completions_filename
#example
# filename = "new_02.pdf"
# page_no = 1 # Change to the desired page number
# questions = ["what is the name of company?"]
# # "what is the date of issue?",
# # "What is the invoice number?",
# # "What is the billed to address?",
# # "what is company?"] # Add your questions
# completions_filename = process_pdf_to_image_and_text(filename, 1, questions, "out.txt", "img.png")
# from pdf2image import convert_from_path
# from PIL import Image
# import torch
# from transformers import Pix2StructForConditionalGeneration, Pix2StructProcessor
# import pytesseract # Import Pytesseract
# import os
# # os.environ["TORCH_USE_CUDA_DSA"] = "true"
# # Set the device
# DEVICE = "cpu"
# # DEVICE = "cuda:1"
# def generate(model, processor, img, questions):
# inputs = processor(images=[img for _ in range(len(questions))],
# text=questions, return_tensors="pt").to(DEVICE)
# predictions = model.generate(**inputs, max_new_tokens=256)
# return zip(questions, processor.batch_decode(predictions, skip_special_tokens=True))
# def convert_pdf_to_image(filename, page_no):
# images = convert_from_path(filename)
# if page_no < 1 or page_no > len(images):
# raise ValueError(f"Page {page_no} is out of range for the provided PDF.")
# image = images[page_no - 1]
# return image
# def extract_text_from_image(image):
# # Use Pytesseract to extract text from the image
# text = pytesseract.image_to_string(image, lang="eng") # Specify language if needed
# return text
# def process_pdf_to_image_and_text(filename, page_no, questions, rawtext_filename, image_filename):
# # Load the Pix2Struct model and processor
# model = Pix2StructForConditionalGeneration.from_pretrained("google/pix2struct-docvqa-large").to(DEVICE)
# processor = Pix2StructProcessor.from_pretrained("google/pix2struct-docvqa-large")
# image = convert_pdf_to_image(filename, page_no)
# image.save(image_filename)
# completions = generate(model, processor, image, questions)
# print(completions)
# # Print the completions to the terminal
# answers = []
# for question, answer in completions:
# answers.append(answer)
# print(f"Question: {question}")
# print(f"Answer: {answer}\n")
# # Extract text from the image using Pytesseract
# extracted_text = extract_text_from_image(image)
# # Write the extracted text to a text file
# with open(rawtext_filename, "w") as output_file:
# output_file.write(extracted_text)
# # Write the completions to a text file
# completions_filename = "output_text.txt"
# with open(completions_filename, "w+") as output_file:
# for question, answer in zip(questions, answers):
# output_file.write(f"Question: {question}\nAnswer: {answer}\n\n")
# return completions_filename
# # Example usage
# filename = "invoice_9.pdf"
# page_no = 1 # Change to the desired page number
# questions = ["what is the name of company?",]
# # "what is the date of issue?",
# # # ] # Add your questions
# completions_filename = process_pdf_to_image_and_text(filename, 1, questions, "out.txt", "img.png")
Write
Preview
Markdown
is supported
Attach a file
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to post a comment