invoice_to_info_module.py
5.54 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
from pdf2image import convert_from_path
from PIL import Image
import torch
from transformers import Pix2StructForConditionalGeneration, Pix2StructProcessor
import textract
import os
# CUDA debugging aids: device-side assertions and synchronous kernel launches
# so CUDA errors surface at the failing call instead of a later async point.
# NOTE(review): these must be set before CUDA is initialized to take effect —
# confirm no torch CUDA call happens before this module is imported.
os.environ["TORCH_USE_CUDA_DSA"] = "true"
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
# CUDA_LAUNCH_BLOCKING=1.
# Set the device
# DEVICE = "cuda"
# DEVICE = "cpu"
# Device all tensors/models are moved to; hard-codes the second GPU.
# NOTE(review): fails on single-GPU or CPU-only hosts — consider making configurable.
DEVICE = "cuda:1"
def generate(model, processor, img, questions):
    """Ask the DocVQA model every question in *questions* about one image.

    The single image is replicated once per question so the processor can
    build one batched forward pass. Returns an iterator of
    (question, decoded_answer) pairs.
    """
    batch = processor(
        images=[img] * len(questions),
        text=questions,
        return_tensors="pt",
    ).to(DEVICE)
    outputs = model.generate(**batch, max_new_tokens=256)
    decoded = processor.batch_decode(outputs, skip_special_tokens=True)
    return zip(questions, decoded)
def convert_pdf_to_image(filename, page_no):
    """Render one page of the PDF at *filename* as a PIL image.

    *page_no* is 1-based. Raises ValueError when the page does not exist.
    """
    pages = convert_from_path(filename)
    if not 1 <= page_no <= len(pages):
        raise ValueError(f"Page {page_no} is out of range for the provided PDF.")
    return pages[page_no - 1]
def extract_text_from_image(image):
    """OCR the file at *image* with textract and return the text as str.

    NOTE(review): despite the parameter name, the caller in this module
    passes a file *path*, which is what textract.process expects — confirm
    before passing an in-memory PIL image here.
    """
    raw_bytes = textract.process(image)
    return raw_bytes.decode("utf-8")
def process_pdf_to_image_and_text(filename, page_no, questions, rawtext_filename,
                                  image_filename,
                                  completions_filename="output_text.txt"):
    """Run DocVQA over one PDF page and dump results to disk.

    Renders page *page_no* of *filename* to *image_filename*, asks the
    Pix2Struct DocVQA model each question in *questions*, prints every
    question/answer pair, OCRs the rendered page into *rawtext_filename*,
    and writes the Q&A pairs to *completions_filename*.

    Args:
        filename: path to the input PDF.
        page_no: 1-based page number (ValueError from convert_pdf_to_image
            if out of range).
        questions: list of question strings for the model.
        rawtext_filename: output path for the OCR'd page text.
        image_filename: output path for the rendered page image.
        completions_filename: output path for the Q&A text file
            (defaults to the previously hard-coded "output_text.txt").

    Returns:
        The path the Q&A pairs were written to (completions_filename).
    """
    # Loading the model on every call is expensive; kept here to preserve
    # the existing interface. Hoist to module level if called repeatedly.
    model = Pix2StructForConditionalGeneration.from_pretrained(
        "google/pix2struct-docvqa-large").to(DEVICE)
    processor = Pix2StructProcessor.from_pretrained(
        "google/pix2struct-docvqa-large")

    image = convert_pdf_to_image(filename, page_no)
    image.save(image_filename)

    completions = generate(model, processor, image, questions)
    # Echo each pair to the terminal while collecting answers for the file.
    answers = []
    for question, answer in completions:
        answers.append(answer)
        print(f"Question: {question}")
        print(f"Answer: {answer}\n")

    # OCR the saved image file (textract works on a path, not a PIL image).
    extracted_text = extract_text_from_image(image_filename)
    # Explicit encoding: the platform default may not round-trip OCR output.
    with open(rawtext_filename, "w", encoding="utf-8") as output_file:
        output_file.write(extracted_text)

    # "w" (not "w+") — the file is only written, never read back here.
    with open(completions_filename, "w", encoding="utf-8") as output_file:
        for question, answer in zip(questions, answers):
            output_file.write(f"Question: {question}\nAnswer: {answer}\n\n")
    return completions_filename
# Example usage:
# filename = "new_02.pdf"
# page_no = 1 # Change to the desired page number
# questions = ["what is the name of company?"]
# # "what is the date of issue?",
# # "What is the invoice number?",
# # "What is the billed to address?",
# # "what is company?"] # Add your questions
# completions_filename = process_pdf_to_image_and_text(filename, 1, questions, "out.txt", "img.png")
# from pdf2image import convert_from_path
# from PIL import Image
# import torch
# from transformers import Pix2StructForConditionalGeneration, Pix2StructProcessor
# import pytesseract # Import Pytesseract
# import os
# # os.environ["TORCH_USE_CUDA_DSA"] = "true"
# # Set the device
# DEVICE = "cpu"
# # DEVICE = "cuda:1"
# def generate(model, processor, img, questions):
# inputs = processor(images=[img for _ in range(len(questions))],
# text=questions, return_tensors="pt").to(DEVICE)
# predictions = model.generate(**inputs, max_new_tokens=256)
# return zip(questions, processor.batch_decode(predictions, skip_special_tokens=True))
# def convert_pdf_to_image(filename, page_no):
# images = convert_from_path(filename)
# if page_no < 1 or page_no > len(images):
# raise ValueError(f"Page {page_no} is out of range for the provided PDF.")
# image = images[page_no - 1]
# return image
# def extract_text_from_image(image):
# # Use Pytesseract to extract text from the image
# text = pytesseract.image_to_string(image, lang="eng") # Specify language if needed
# return text
# def process_pdf_to_image_and_text(filename, page_no, questions, rawtext_filename, image_filename):
# # Load the Pix2Struct model and processor
# model = Pix2StructForConditionalGeneration.from_pretrained("google/pix2struct-docvqa-large").to(DEVICE)
# processor = Pix2StructProcessor.from_pretrained("google/pix2struct-docvqa-large")
# image = convert_pdf_to_image(filename, page_no)
# image.save(image_filename)
# completions = generate(model, processor, image, questions)
# print(completions)
# # Print the completions to the terminal
# answers = []
# for question, answer in completions:
# answers.append(answer)
# print(f"Question: {question}")
# print(f"Answer: {answer}\n")
# # Extract text from the image using Pytesseract
# extracted_text = extract_text_from_image(image)
# # Write the extracted text to a text file
# with open(rawtext_filename, "w") as output_file:
# output_file.write(extracted_text)
# # Write the completions to a text file
# completions_filename = "output_text.txt"
# with open(completions_filename, "w+") as output_file:
# for question, answer in zip(questions, answers):
# output_file.write(f"Question: {question}\nAnswer: {answer}\n\n")
# return completions_filename
# # Example usage
# filename = "invoice_9.pdf"
# page_no = 1 # Change to the desired page number
# questions = ["what is the name of company?",]
# # "what is the date of issue?",
# # # ] # Add your questions
# completions_filename = process_pdf_to_image_and_text(filename, 1, questions, "out.txt", "img.png")