# main_model.py
from pdf2image import convert_from_path
from PIL import Image
import torch
from transformers import Pix2StructForConditionalGeneration, Pix2StructProcessor
import textract

# Select the device: use a GPU when available, otherwise fall back to the CPU
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

def generate(model, processor, img, questions):
    """Run the DocVQA model on the same page image for each question and
    return an iterable of (question, answer) pairs."""
    inputs = processor(images=[img for _ in range(len(questions))],
                       text=questions, return_tensors="pt").to(DEVICE)
    predictions = model.generate(**inputs, max_new_tokens=256)
    return zip(questions, processor.batch_decode(predictions, skip_special_tokens=True))

def convert_pdf_to_image(filename, page_no, image_filename):
    """Render the requested page of the PDF (1-indexed), save it to
    image_filename, and return it as a PIL image."""
    images = convert_from_path(filename)
    if page_no < 1 or page_no > len(images):
        raise ValueError(f"Page {page_no} is out of range for the provided PDF.")

    image = images[page_no - 1]
    image.save(image_filename)
    return image

def extract_text_from_image(image_path):
    """Extract raw text from the saved page image with textract.
    Note: textract.process expects a file path on disk (not a PIL image) and
    delegates image OCR to Tesseract, which must be installed separately."""
    text = textract.process(image_path)
    return text.decode("utf-8")

def process_pdf_to_image_and_text(filename, page_no, questions, rawtext_filename, image_filename):
    """Convert one PDF page to an image, answer the given questions with
    Pix2Struct DocVQA, OCR the page with textract, and write both outputs to disk."""
    # Load the Pix2Struct DocVQA model and processor
    model = Pix2StructForConditionalGeneration.from_pretrained("google/pix2struct-docvqa-large").to(DEVICE)
    processor = Pix2StructProcessor.from_pretrained("google/pix2struct-docvqa-large")

    image = convert_pdf_to_image(filename, page_no, image_filename)
    completions = generate(model, processor, image, questions)

    # Print the completions to the terminal
    answers = []
    for question, answer in completions:
        answers.append(answer)
        print(f"Question: {question}")
        print(f"Answer: {answer}\n")

    # Extract raw text from the saved page image with textract
    extracted_text = extract_text_from_image(image_filename)

    # Write the extracted text to a text file
    with open(rawtext_filename, "w", encoding="utf-8") as output_file:
        output_file.write(extracted_text)

    # Write the question/answer pairs to a text file
    completions_filename = "output_text.txt"
    with open(completions_filename, "w", encoding="utf-8") as output_file:
        for question, answer in zip(questions, answers):
            output_file.write(f"Question: {question}\nAnswer: {answer}\n\n")

    return completions_filename


# filename = "invoice_4.pdf"
# page_no = 1  # Change to the desired page number
# questions = ["what is the name of company?",
#              "what is the date of issue?",
#              "What is the invoice number?",
#              "What is the billed to address?",
#              "what is company?"]  # Add your questions

# completions_filename = process_pdf_to_image_and_text(filename, page_no, questions, "out.txt", "img.png")
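
# Minimal command-line entry point (a sketch): it assumes the PDF path, the
# 1-indexed page number, and one or more questions are passed as positional
# arguments, and it reuses the "out.txt" / "img.png" output names from the
# example above, e.g.:
#   python main_model.py invoice_4.pdf 1 "What is the invoice number?"
if __name__ == "__main__":
    import sys

    if len(sys.argv) < 4:
        print("Usage: python main_model.py <pdf> <page_no> <question> [<question> ...]")
        sys.exit(1)

    pdf_path = sys.argv[1]
    page = int(sys.argv[2])
    user_questions = sys.argv[3:]
    process_pdf_to_image_and_text(pdf_path, page, user_questions, "out.txt", "img.png")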