Commit 45a79020 by Saroj Dhiman

This is the updated module, in which I made some changes using the pytesseract library.

1 parent 843f4f6d
Showing with 151 additions and 0 deletions
from pdf2image import convert_from_path
from PIL import Image
import torch
from transformers import Pix2StructForConditionalGeneration, Pix2StructProcessor
import textract
import os
# CUDA debugging switches, set before any model is moved to the GPU.
# CUDA_LAUNCH_BLOCKING=1 makes kernel launches synchronous so errors surface
# at the call that caused them.
# NOTE(review): TORCH_USE_CUDA_DSA is presumably meant to enable device-side
# assertions; PyTorch describes it as a build-time option — confirm it has
# any effect when set at runtime.
os.environ.update(
    {
        "TORCH_USE_CUDA_DSA": "true",
        "CUDA_LAUNCH_BLOCKING": "1",
    }
)

# Device string used for the model and its inputs.
# Other values tried previously: "cuda" and "cpu".
DEVICE = "cuda:1"
def generate(model, processor, img, questions):
    """Ask every question about the same image and pair it with its answer.

    Args:
        model: Object exposing ``generate(**inputs, max_new_tokens=...)``.
        processor: Callable that builds model inputs from ``images`` and
            ``text`` and that provides ``batch_decode``.
        img: The image every question refers to.
        questions: Sequence of question strings.

    Returns:
        A ``zip`` iterator of ``(question, answer)`` pairs. Being an
        iterator, it can be consumed only once.
    """
    # One copy of the image per question, so images and prompts line up.
    repeated_images = [img] * len(questions)
    batch = processor(images=repeated_images, text=questions, return_tensors="pt")
    batch = batch.to(DEVICE)
    output_ids = model.generate(**batch, max_new_tokens=256)
    answers = processor.batch_decode(output_ids, skip_special_tokens=True)
    return zip(questions, answers)
def convert_pdf_to_image(filename, page_no):
    """Render a single page of a PDF as an image.

    Only the requested page is rasterised, rather than converting the whole
    document and discarding every other page.

    Args:
        filename: Path to the PDF file.
        page_no: 1-based number of the page to render.

    Returns:
        The rendered page, as produced by ``pdf2image.convert_from_path``.

    Raises:
        ValueError: If ``page_no`` is less than 1 or beyond the last page.
    """
    out_of_range = f"Page {page_no} is out of range for the provided PDF."
    if page_no < 1:
        # Reject early: pdf2image would otherwise clamp this to page 1.
        raise ValueError(out_of_range)

    # pdf2image returns an empty list when the requested range lies past
    # the end of the document.
    pages = convert_from_path(filename, first_page=page_no, last_page=page_no)
    if not pages:
        raise ValueError(out_of_range)
    return pages[0]
def extract_text_from_image(image):
    """Extract text with textract and return it as a string.

    Args:
        image: Value handed straight to ``textract.process``; the caller in
            this file passes the path of a saved image file.

    Returns:
        The output of ``textract.process`` decoded as UTF-8.
    """
    raw_bytes = textract.process(image)
    return raw_bytes.decode(encoding="utf-8")
def process_pdf_to_image_and_text(
    filename,
    page_no,
    questions,
    rawtext_filename,
    image_filename,
    completions_filename="output_text.txt",
):
    """Answer questions about one PDF page and dump its raw text.

    The page is rendered to an image and saved to ``image_filename``. The
    Pix2Struct DocVQA model answers each question about that image; the
    question/answer pairs are printed and written to
    ``completions_filename``. Text extracted from the saved image is written
    to ``rawtext_filename``.

    Args:
        filename: Path to the source PDF.
        page_no: 1-based page number to process.
        questions: Sequence of question strings to ask about the page.
        rawtext_filename: Destination file for the extracted raw text.
        image_filename: Destination file for the rendered page image.
        completions_filename: Destination file for the question/answer
            pairs. Defaults to ``"output_text.txt"``, the path that was
            previously hard-coded.

    Returns:
        The path of the file the question/answer pairs were written to.

    Raises:
        ValueError: Propagated from ``convert_pdf_to_image`` when
            ``page_no`` is out of range.
    """
    # The model and processor are loaded on every call.
    model_name = "google/pix2struct-docvqa-large"
    model = Pix2StructForConditionalGeneration.from_pretrained(model_name).to(DEVICE)
    processor = Pix2StructProcessor.from_pretrained(model_name)

    image = convert_pdf_to_image(filename, page_no)
    image.save(image_filename)

    # generate() returns a one-shot iterator; materialise it so the pairs
    # can be both printed and written to disk.
    completions = list(generate(model, processor, image, questions))
    for question, answer in completions:
        print(f"Question: {question}")
        print(f"Answer: {answer}\n")

    # Text extraction works from the image file saved above.
    extracted_text = extract_text_from_image(image_filename)

    # Explicit UTF-8: the extracted text was decoded as UTF-8 and may hold
    # characters the platform's default encoding cannot represent.
    with open(rawtext_filename, "w", encoding="utf-8") as output_file:
        output_file.write(extracted_text)

    with open(completions_filename, "w", encoding="utf-8") as output_file:
        for question, answer in completions:
            output_file.write(f"Question: {question}\nAnswer: {answer}\n\n")

    return completions_filename
# Example usage (commented out):
# filename = "new_02.pdf"
# page_no = 1 # Change to the desired page number
# questions = ["what is the name of company?"]
# # "what is the date of issue?",
# # "What is the invoice number?",
# # "What is the billed to address?",
# # "what is company?"] # Add your questions
# completions_filename = process_pdf_to_image_and_text(filename, 1, questions, "out.txt", "img.png")
# from pdf2image import convert_from_path
# from PIL import Image
# import torch
# from transformers import Pix2StructForConditionalGeneration, Pix2StructProcessor
# import pytesseract # Import Pytesseract
# import os
# # os.environ["TORCH_USE_CUDA_DSA"] = "true"
# # Set the device
# DEVICE = "cpu"
# # DEVICE = "cuda:1"
# def generate(model, processor, img, questions):
# inputs = processor(images=[img for _ in range(len(questions))],
# text=questions, return_tensors="pt").to(DEVICE)
# predictions = model.generate(**inputs, max_new_tokens=256)
# return zip(questions, processor.batch_decode(predictions, skip_special_tokens=True))
# def convert_pdf_to_image(filename, page_no):
# images = convert_from_path(filename)
# if page_no < 1 or page_no > len(images):
# raise ValueError(f"Page {page_no} is out of range for the provided PDF.")
# image = images[page_no - 1]
# return image
# def extract_text_from_image(image):
# # Use Pytesseract to extract text from the image
# text = pytesseract.image_to_string(image, lang="eng") # Specify language if needed
# return text
# def process_pdf_to_image_and_text(filename, page_no, questions, rawtext_filename, image_filename):
# # Load the Pix2Struct model and processor
# model = Pix2StructForConditionalGeneration.from_pretrained("google/pix2struct-docvqa-large").to(DEVICE)
# processor = Pix2StructProcessor.from_pretrained("google/pix2struct-docvqa-large")
# image = convert_pdf_to_image(filename, page_no)
# image.save(image_filename)
# completions = generate(model, processor, image, questions)
# print(completions)
# # Print the completions to the terminal
# answers = []
# for question, answer in completions:
# answers.append(answer)
# print(f"Question: {question}")
# print(f"Answer: {answer}\n")
# # Extract text from the image using Pytesseract
# extracted_text = extract_text_from_image(image)
# # Write the extracted text to a text file
# with open(rawtext_filename, "w") as output_file:
# output_file.write(extracted_text)
# # Write the completions to a text file
# completions_filename = "output_text.txt"
# with open(completions_filename, "w+") as output_file:
# for question, answer in zip(questions, answers):
# output_file.write(f"Question: {question}\nAnswer: {answer}\n\n")
# return completions_filename
# # Example usage
# filename = "invoice_9.pdf"
# page_no = 1 # Change to the desired page number
# questions = ["what is the name of company?",]
# # "what is the date of issue?",
# # # ] # Add your questions
# completions_filename = process_pdf_to_image_and_text(filename, 1, questions, "out.txt", "img.png")
Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!