updated module

Saroj Dhiman
Commit 160d8792 authored Nov 07, 2023 by Saroj Dhiman
Showing with 76 additions and 0 deletions
updated_model.py
--- a/updated_model.py
+++ b/updated_model.py
+from pdf2image import convert_from_path
+from PIL import Image
+import torch
+from transformers import Pix2StructForConditionalGeneration, Pix2StructProcessor
+import textract
+import os
+import time
+# Debug settings, applied before the first CUDA call made below.
+# CUDA_LAUNCH_BLOCKING=1 makes kernel launches synchronous, so a CUDA error is
+# reported at the call that caused it (at the cost of throughput).
+# NOTE(review): TORCH_USE_CUDA_DSA is presumably meant to enable device-side
+# assertions; confirm that setting it as an environment variable has any effect.
+os.environ["TORCH_USE_CUDA_DSA"] = "true"
+os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
+# Prefer the GPU, but fall back to the CPU so the module still loads on
+# machines without a usable CUDA device.
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+# Checkpoint id shared by the model and its processor.
+MODEL_NAME = "google/pix2struct-docvqa-large"
+# Load the Pix2Struct model (moved to DEVICE) and its matching processor.
+model = Pix2StructForConditionalGeneration.from_pretrained(MODEL_NAME).to(DEVICE)
+processor = Pix2StructProcessor.from_pretrained(MODEL_NAME)
+def generate(img, questions):
+    """Answer every question in ``questions`` against the single image ``img``.
+
+    The image is repeated once per question so the processor receives one
+    (image, question) pair per batch entry.
+
+    Returns:
+        A ``zip`` iterator of ``(question, decoded_answer)`` pairs. It can be
+        consumed only once.
+    """
+    repeated_images = [img] * len(questions)
+    batch = processor(images=repeated_images, text=questions, return_tensors="pt")
+    batch = batch.to(DEVICE)
+    generated_ids = model.generate(**batch, max_new_tokens=256)
+    decoded_answers = processor.batch_decode(generated_ids, skip_special_tokens=True)
+    return zip(questions, decoded_answers)
+def convert_pdf_to_images(filename):
+    """Return whatever ``pdf2image.convert_from_path`` produces for ``filename``."""
+    return convert_from_path(filename)
+def extract_text_from_image(image):
+    """Extract text with ``textract`` and return it decoded as UTF-8.
+
+    ``image`` is handed straight to ``textract.process``; the caller in this
+    file passes the path of a saved PNG rather than an in-memory image.
+    """
+    raw_bytes = textract.process(image)
+    decoded = raw_bytes.decode("utf-8")
+    return decoded
+def process_pdf_to_image_and_text(filename, questions, rawtext_filename, image_filename, completions_filename):
+    """Run question answering and text extraction on every page of a PDF.
+
+    For each page (numbered from 1) this saves the page image, asks every
+    question against it, prints the answers, writes the extracted text to a
+    per-page file, and appends the answers to ``completions_filename``.
+
+    Args:
+        filename: Path of the PDF to process.
+        questions: Questions asked against every page.
+        rawtext_filename: Prefix for the per-page text files, written as
+            ``{rawtext_filename}_page_{n}.txt``.
+        image_filename: Prefix for the per-page images, written as
+            ``{image_filename}_page_{n}.png``.
+        completions_filename: File that receives the answers. It is opened in
+            append mode, so existing content is kept.
+    """
+    for page_no, image in enumerate(convert_pdf_to_images(filename), 1):
+        # Build the page image path once; it is used for saving and extraction.
+        page_image_path = f"{image_filename}_page_{page_no}.png"
+        image.save(page_image_path)
+        # generate() returns a one-shot iterator; keep the pairs in a list so
+        # they can be both printed and written to the completions file.
+        completions = list(generate(image, questions))
+        for question, answer in completions:
+            print(f"Question: {question}")
+            print(f"Answer: {answer}\n")
+        # Extract text from the saved page image.
+        extracted_text = extract_text_from_image(page_image_path)
+        # The text was decoded as UTF-8, so write it as UTF-8 instead of the
+        # platform default encoding, which may not be able to represent it.
+        with open(f"{rawtext_filename}_page_{page_no}.txt", "w", encoding="utf-8") as output_file:
+            output_file.write(extracted_text)
+        with open(completions_filename, "a", encoding="utf-8") as output_file:
+            for question, answer in completions:
+                output_file.write(f"Page {page_no}\nQuestion: {question}\nAnswer: {answer}\n\n")
+# Input PDF to process.
+filename = "invoice_123.pdf"
+# Questions asked against every page of the PDF.
+questions = [
+    "what is the name of company?",
+    "what is the date of issue?",
+    "What is the invoice number?",
+    "What is the billed to address?",
+    "what is company?",
+]
+# File that receives the answers for all pages.
+completions_filename = "out.txt"
+start = time.time()
+# NOTE(review): "raw.txt" and "img.png" are used as name prefixes, so outputs are
+# named like "raw.txt_page_1.txt" and "img.png_page_1.png" - confirm this is intended.
+process_pdf_to_image_and_text(filename, questions, "raw.txt", "img.png", completions_filename)
+# Elapsed wall-clock time, in minutes.
+elapsed_minutes = (time.time() - start) / 60
+print("time taken ", elapsed_minutes)