invoice_to_info_module.py
5.54 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
from pdf2image import convert_from_path
from PIL import Image
import torch
from transformers import Pix2StructForConditionalGeneration, Pix2StructProcessor
import textract
import os
# CUDA debugging aids: device-side assertions and synchronous kernel launches
# so CUDA errors surface at the failing call instead of a later async point.
# NOTE(review): these must be set before CUDA is initialized to take effect —
# confirm no torch CUDA call happens before this module is imported.
os.environ["TORCH_USE_CUDA_DSA"] = "true"
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
# CUDA_LAUNCH_BLOCKING=1.
# Set the device
# DEVICE = "cuda"
# DEVICE = "cpu"
# Device all tensors/models are moved to; hard-codes the second GPU.
# NOTE(review): fails on single-GPU or CPU-only hosts — consider making configurable.
DEVICE = "cuda:1"
def generate(model, processor, img, questions):
    """Ask the DocVQA model every question in *questions* about one image.

    The single image is replicated once per question so the processor can
    build one batched forward pass. Returns an iterator of
    (question, decoded_answer) pairs.
    """
    batch = processor(
        images=[img] * len(questions),
        text=questions,
        return_tensors="pt",
    ).to(DEVICE)
    outputs = model.generate(**batch, max_new_tokens=256)
    decoded = processor.batch_decode(outputs, skip_special_tokens=True)
    return zip(questions, decoded)
def convert_pdf_to_image(filename, page_no):
    """Render one page of the PDF at *filename* as a PIL image.

    *page_no* is 1-based. Raises ValueError when the page does not exist.
    """
    pages = convert_from_path(filename)
    if not 1 <= page_no <= len(pages):
        raise ValueError(f"Page {page_no} is out of range for the provided PDF.")
    return pages[page_no - 1]
def extract_text_from_image(image):
    """OCR the file at *image* with textract and return the text as str.

    NOTE(review): despite the parameter name, the caller in this module
    passes a file *path*, which is what textract.process expects — confirm
    before passing an in-memory PIL image here.
    """
    raw_bytes = textract.process(image)
    return raw_bytes.decode("utf-8")
def process_pdf_to_image_and_text(filename, page_no, questions, rawtext_filename,
                                  image_filename,
                                  completions_filename="output_text.txt"):
    """Run DocVQA over one PDF page and dump results to disk.

    Renders page *page_no* of *filename* to *image_filename*, asks the
    Pix2Struct DocVQA model each question in *questions*, prints every
    question/answer pair, OCRs the rendered page into *rawtext_filename*,
    and writes the Q&A pairs to *completions_filename*.

    Args:
        filename: path to the input PDF.
        page_no: 1-based page number (ValueError from convert_pdf_to_image
            if out of range).
        questions: list of question strings for the model.
        rawtext_filename: output path for the OCR'd page text.
        image_filename: output path for the rendered page image.
        completions_filename: output path for the Q&A text file
            (defaults to the previously hard-coded "output_text.txt").

    Returns:
        The path the Q&A pairs were written to (completions_filename).
    """
    # Loading the model on every call is expensive; kept here to preserve
    # the existing interface. Hoist to module level if called repeatedly.
    model = Pix2StructForConditionalGeneration.from_pretrained(
        "google/pix2struct-docvqa-large").to(DEVICE)
    processor = Pix2StructProcessor.from_pretrained(
        "google/pix2struct-docvqa-large")

    image = convert_pdf_to_image(filename, page_no)
    image.save(image_filename)

    completions = generate(model, processor, image, questions)
    # Echo each pair to the terminal while collecting answers for the file.
    answers = []
    for question, answer in completions:
        answers.append(answer)
        print(f"Question: {question}")
        print(f"Answer: {answer}\n")

    # OCR the saved image file (textract works on a path, not a PIL image).
    extracted_text = extract_text_from_image(image_filename)
    # Explicit encoding: the platform default may not round-trip OCR output.
    with open(rawtext_filename, "w", encoding="utf-8") as output_file:
        output_file.write(extracted_text)

    # "w" (not "w+") — the file is only written, never read back here.
    with open(completions_filename, "w", encoding="utf-8") as output_file:
        for question, answer in zip(questions, answers):
            output_file.write(f"Question: {question}\nAnswer: {answer}\n\n")
    return completions_filename
# Example usage:
# filename = "new_02.pdf"
# page_no = 1 # Change to the desired page number
# questions = ["what is the name of company?"]
# # "what is the date of issue?",
# # "What is the invoice number?",
# # "What is the billed to address?",
# # "what is company?"] # Add your questions
# completions_filename = process_pdf_to_image_and_text(filename, 1, questions, "out.txt", "img.png")
# from pdf2image import convert_from_path
# from PIL import Image
# import torch
# from transformers import Pix2StructForConditionalGeneration, Pix2StructProcessor
# import pytesseract # Import Pytesseract
# import os
# # os.environ["TORCH_USE_CUDA_DSA"] = "true"
# # Set the device
# DEVICE = "cpu"
# # DEVICE = "cuda:1"
# def generate(model, processor, img, questions):
# inputs = processor(images=[img for _ in range(len(questions))],
# text=questions, return_tensors="pt").to(DEVICE)
# predictions = model.generate(**inputs, max_new_tokens=256)
# return zip(questions, processor.batch_decode(predictions, skip_special_tokens=True))
# def convert_pdf_to_image(filename, page_no):
# images = convert_from_path(filename)
# if page_no < 1 or page_no > len(images):
# raise ValueError(f"Page {page_no} is out of range for the provided PDF.")
# image = images[page_no - 1]
# return image
# def extract_text_from_image(image):
# # Use Pytesseract to extract text from the image
# text = pytesseract.image_to_string(image, lang="eng") # Specify language if needed
# return text
# def process_pdf_to_image_and_text(filename, page_no, questions, rawtext_filename, image_filename):
# # Load the Pix2Struct model and processor
# model = Pix2StructForConditionalGeneration.from_pretrained("google/pix2struct-docvqa-large").to(DEVICE)
# processor = Pix2StructProcessor.from_pretrained("google/pix2struct-docvqa-large")
# image = convert_pdf_to_image(filename, page_no)
# image.save(image_filename)
# completions = generate(model, processor, image, questions)
# print(completions)
# # Print the completions to the terminal
# answers = []
# for question, answer in completions:
# answers.append(answer)
# print(f"Question: {question}")
# print(f"Answer: {answer}\n")
# # Extract text from the image using Pytesseract
# extracted_text = extract_text_from_image(image)
# # Write the extracted text to a text file
# with open(rawtext_filename, "w") as output_file:
# output_file.write(extracted_text)
# # Write the completions to a text file
# completions_filename = "output_text.txt"
# with open(completions_filename, "w+") as output_file:
# for question, answer in zip(questions, answers):
# output_file.write(f"Question: {question}\nAnswer: {answer}\n\n")
# return completions_filename
# # Example usage
# filename = "invoice_9.pdf"
# page_no = 1 # Change to the desired page number
# questions = ["what is the name of company?",]
# # "what is the date of issue?",
# # # ] # Add your questions
# completions_filename = process_pdf_to_image_and_text(filename, 1, questions, "out.txt", "img.png")