# updated_model.py

from pdf2image import convert_from_path
from PIL import Image
import torch
from transformers import Pix2StructForConditionalGeneration, Pix2StructProcessor
import textract
import os
import time
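# System dependencies assumed: poppler (required by pdf2image) and tesseract-ocr
# (used by textract to OCR the rendered page images).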
os.environ["TORCH_USE_CUDA_DSA"] = "true"
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
# CUDA_LAUNCH_BLOCKING="1"

# Set the device: use the GPU when available, otherwise fall back to CPU
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Load the Pix2Struct model and processor
model = Pix2StructForConditionalGeneration.from_pretrained("google/pix2struct-docvqa-large").to(DEVICE)
processor = Pix2StructProcessor.from_pretrained("google/pix2struct-docvqa-large")
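# Pix2Struct DocVQA works purely from pixels: the processor renders each question
# as a text header on top of the page image, and the model decodes the answer.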


def generate(img, questions):
    # Pair the same page image with every question so all questions run in one batch.
    inputs = processor(images=[img] * len(questions),
                       text=questions, return_tensors="pt").to(DEVICE)
    predictions = model.generate(**inputs, max_new_tokens=256)
    return zip(questions, processor.batch_decode(predictions, skip_special_tokens=True))
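# Example usage (hypothetical file name; assumes a page image already exists on disk):
#   for q, a in generate(Image.open("sample_page.png").convert("RGB"), ["What is the total?"]):
#       print(q, "->", a)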

def convert_pdf_to_images(filename):
    images = convert_from_path(filename)
    return images

def extract_text_from_image(image_path):
    # textract expects a file path; for PNG input it runs OCR (tesseract) on the image
    text = textract.process(image_path)
    return text.decode("utf-8")

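# For each page this writes {image_filename}_page_N.png and {rawtext_filename}_page_N.txt
# (the arguments are treated as base names; page number and extension are appended),
# and appends that page's question/answer pairs to completions_filename.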
def process_pdf_to_image_and_text(filename, questions, rawtext_filename, image_filename, completions_filename):
    images = convert_pdf_to_images(filename)

    for page_no, image in enumerate(images, 1):
        image.save(f"{image_filename}_page_{page_no}.png")
        completions = generate(image, questions)

        # Print the completions to the terminal
        answers = []
        for question, answer in completions:
            answers.append(answer)
            print(f"Question: {question}")
            print(f"Answer: {answer}\n")

        # Extract text from the image using Textract
        extracted_text = extract_text_from_image(f"{image_filename}_page_{page_no}.png")

        # Write the extracted text to a text file
        with open(f"{rawtext_filename}_page_{page_no}.txt", "w") as output_file:
            output_file.write(extracted_text)

        # Write the completions to a text file
        with open(completions_filename, "a") as output_file:
            for question, answer in zip(questions, answers):
                output_file.write(f"Page {page_no}\nQuestion: {question}\nAnswer: {answer}\n\n")

filename = "invoice_123.pdf"
questions = ["what is the name of company?",
                 "what is the date of issue?",
                 "What is the invoice number?",
                 "What is the billed to address?",
                 "what is company?"]  # Add your questions

completions_filename = "out.txt"  # Provide the desired output text file name
start = time.time()
process_pdf_to_image_and_text(filename, questions, "raw.txt", "img.png", completions_filename)
print("time taken " , ((time.time() - start) / 60))