Commit 160d8792 by Saroj Dhiman

updated module

1 parent a21b3715
Showing with 76 additions and 0 deletions
from pdf2image import convert_from_path
from PIL import Image
import torch
from transformers import Pix2StructForConditionalGeneration, Pix2StructProcessor
import textract
import os
import time
# CUDA debugging aids. CUDA_LAUNCH_BLOCKING=1 makes kernel launches
# synchronous, so an error is reported at the call that caused it.
# NOTE(review): TORCH_USE_CUDA_DSA is described by PyTorch as a build-time
# option -- confirm that setting it as an environment variable has any effect.
os.environ["TORCH_USE_CUDA_DSA"] = "true"
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

# Prefer the GPU, but fall back to the CPU so that the module still loads on
# hosts without CUDA. Set this to e.g. "cuda:0" to pin a specific GPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Load the Pix2Struct DocVQA model and its processor once, at import time.
MODEL_NAME = "google/pix2struct-docvqa-large"
model = Pix2StructForConditionalGeneration.from_pretrained(MODEL_NAME).to(DEVICE)
processor = Pix2StructProcessor.from_pretrained(MODEL_NAME)
def generate(img, questions):
    """Answer each question about a single page image with Pix2Struct.

    Args:
        img: The page image. The same image is paired with every question.
        questions: A sized sequence of question strings.

    Returns:
        An iterator of ``(question, answer)`` pairs, in the order of
        ``questions``. The iterator is empty when ``questions`` is empty.
    """
    # Nothing to ask: skip the processor/model round trip entirely instead of
    # pushing an empty batch through them.
    if len(questions) == 0:
        return zip((), ())

    # Build a batch in which every question is paired with the same image.
    inputs = processor(
        images=[img] * len(questions),
        text=questions,
        return_tensors="pt",
    ).to(DEVICE)
    predictions = model.generate(**inputs, max_new_tokens=256)
    answers = processor.batch_decode(predictions, skip_special_tokens=True)
    return zip(questions, answers)
def convert_pdf_to_images(filename):
    """Render the PDF at ``filename`` via pdf2image's ``convert_from_path``.

    Returns whatever ``convert_from_path`` returns for that file; the caller
    in this file iterates it as a sequence of page images.
    """
    return convert_from_path(filename)
def extract_text_from_image(image):
    """Extract the text of ``image`` with textract and return it as ``str``.

    ``image`` is handed to ``textract.process`` unchanged (the caller in this
    file passes the path of a saved PNG). The bytes that textract returns are
    decoded as UTF-8.
    """
    raw_bytes = textract.process(image)
    return raw_bytes.decode(encoding="utf-8")
def process_pdf_to_image_and_text(filename, questions, rawtext_filename, image_filename, completions_filename):
    """Convert a PDF to page images, answer questions per page, and dump page text.

    For every page ``n`` (numbered from 1) this function:

    * saves the page image as ``{image_filename}_page_{n}.png``;
    * asks every question about the page and prints each question/answer pair;
    * extracts text from the saved PNG and writes it to
      ``{rawtext_filename}_page_{n}.txt``;
    * appends the page's question/answer pairs to ``completions_filename``.

    Args:
        filename: Path of the PDF to process.
        questions: Questions asked about every page.
        rawtext_filename: Prefix of the per-page extracted-text files.
        image_filename: Prefix of the per-page PNG files.
        completions_filename: File the answers are appended to. It is opened
            in append mode, so existing content is kept.

    Returns:
        None. All results are printed or written to disk.
    """
    for page_no, image in enumerate(convert_pdf_to_images(filename), 1):
        # Build the page image path once; it is both written and read below.
        page_image_path = f"{image_filename}_page_{page_no}.png"
        image.save(page_image_path)

        # generate() hands back a one-shot iterator; materialise it so the
        # pairs can be printed here and written to the completions file later.
        completions = list(generate(image, questions))
        for question, answer in completions:
            print(f"Question: {question}")
            print(f"Answer: {answer}\n")

        # Extract text from the saved page image using textract.
        extracted_text = extract_text_from_image(page_image_path)

        # The extracted text was decoded as UTF-8, so write it back as UTF-8
        # instead of relying on the platform's default encoding.
        with open(f"{rawtext_filename}_page_{page_no}.txt", "w", encoding="utf-8") as output_file:
            output_file.write(extracted_text)

        # Append this page's answers to the shared completions file.
        with open(completions_filename, "a", encoding="utf-8") as output_file:
            for question, answer in completions:
                output_file.write(f"Page {page_no}\nQuestion: {question}\nAnswer: {answer}\n\n")
# PDF document to process.
filename = "invoice_123.pdf"

# Questions asked about every page of the PDF.
questions = [
    "what is the name of company?",
    "what is the date of issue?",
    "What is the invoice number?",
    "What is the billed to address?",
    "what is company?",
]  # Add your questions

# Text file that collects the answers for all pages.
completions_filename = "out.txt"

# Run the pipeline and report the wall-clock duration in minutes.
# Note: "raw.txt" and "img.png" are used as filename prefixes; the callee
# appends "_page_<n>.txt" / "_page_<n>.png" to them for every page.
start = time.time()
process_pdf_to_image_and_text(
    filename=filename,
    questions=questions,
    rawtext_filename="raw.txt",
    image_filename="img.png",
    completions_filename=completions_filename,
)
elapsed_minutes = (time.time() - start) / 60
print("time taken ", elapsed_minutes)
Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!