Saroj Dhiman / invoice information extractor
Commit 0fbaffe3 authored Nov 02, 2023 by Saroj Dhiman
final_project
Showing 2 changed files with 88 additions and 0 deletions
information_ext.py
package_used/final_module.py
information_ext.py 0 → 100644 (new file)
from package_used import final_module  # the module ships in this commit as package_used/final_module.py (was "new_module")

questions = [
    "what is the address of company?",
    "what is the date of issue?",
    "What is the invoice number?",
    "What is the billed to address?",
    "what is company?",
]

rawtext_filename = "extracted_text.txt"
image_filename = "output_image.png"
filename = "invoice_4.pdf"

final_module.process_pdf_to_image_and_text(
    filename, 1, questions, rawtext_filename, image_filename
)

print("PDF to image conversion and completion generation complete.")
print("Check your folder.")
#7241
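A single run of this driver should leave three artifacts in the working directory: output_image.png (the rendered page), extracted_text.txt (the raw OCR text), and output_text.txt (the question/answer pairs written by process_pdf_to_image_and_text below). A minimal post-run sanity check, assuming the script was executed from that same directory:

import os

# Hypothetical check: confirm the three files the pipeline writes are present.
for expected in ("output_image.png", "extracted_text.txt", "output_text.txt"):
    status = "found" if os.path.exists(expected) else "missing"
    print(f"{expected}: {status}")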
package_used/final_module.py 0 → 100644 (new file)
from pdf2image import convert_from_path
from PIL import Image
import torch
from transformers import Pix2StructForConditionalGeneration, Pix2StructProcessor
import textract
import os

# Set the device
DEVICE = "cpu"
def generate(model, processor, img, questions):
    inputs = processor(
        images=[img for _ in range(len(questions))],
        text=questions,
        return_tensors="pt",
    ).to(DEVICE)
    predictions = model.generate(**inputs, max_new_tokens=256)
    return zip(questions, processor.batch_decode(predictions, skip_special_tokens=True))
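# Note: the same page image is repeated once per question, so Pix2Struct answers
# every question against that page in a single batch; batch_decode returns the
# answers in the same order as the questions.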
def convert_pdf_to_image(filename, page_no, image_filename):
    images = convert_from_path(filename)
    if page_no < 1 or page_no > len(images):
        raise ValueError(f"Page {page_no} is out of range for the provided PDF.")
    image = images[page_no - 1]
    image.save(image_filename)
    return image
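# Note: pdf2image.convert_from_path relies on the Poppler utilities being installed
# on the system. page_no is 1-based here and mapped to the 0-based index of the
# page list above.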
def extract_text_from_image(image_path):
    # Use Textract to extract text from the image file on disk
    text = textract.process(image_path)
    return text.decode("utf-8")
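# Note: for image files such as .png, textract typically delegates to Tesseract OCR,
# so the tesseract binary also needs to be available.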
def process_pdf_to_image_and_text(filename, page_no, questions, rawtext_filename, image_filename):
    # Load the Pix2Struct model and processor
    model = Pix2StructForConditionalGeneration.from_pretrained(
        "google/pix2struct-docvqa-large"
    ).to(DEVICE)
    processor = Pix2StructProcessor.from_pretrained("google/pix2struct-docvqa-large")

    image = convert_pdf_to_image(filename, page_no, image_filename)
    completions = generate(model, processor, image, questions)

    # Print the completions to the terminal
    answers = []
    for question, answer in completions:
        answers.append(answer)
        print(f"Question: {question}")
        print(f"Answer: {answer}\n")

    # Extract text from the image using Textract
    extracted_text = extract_text_from_image(image_filename)

    # Write the extracted text to a text file
    with open(rawtext_filename, "w") as output_file:
        output_file.write(extracted_text)

    # Write the completions to a text file
    completions_filename = "output_text.txt"
    with open(completions_filename, "w") as output_file:
        for question, answer in zip(questions, answers):
            output_file.write(f"Question: {question}\nAnswer: {answer}\n\n")

    return completions_filename
# filename = "invoice_4.pdf"
# page_no = 1 # Change to the desired page number
# questions = ["what is the name of company?",
# "what is the date of issue?",
# "What is the invoice number?",
# "What is the billed to address?",
# "what is company?"] # Add your questions
# completions_filename = process_pdf_to_image_and_text(filename, page_no, questions, "out.txt", "img.png")
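If the model and processor are already loaded, the module's generate helper can also be reused directly against a previously saved page image, without rerunning the whole pipeline. A minimal sketch, assuming output_image.png exists from an earlier run and that final_module is importable from package_used:

from PIL import Image
from transformers import Pix2StructForConditionalGeneration, Pix2StructProcessor

from package_used import final_module

# Load the same DocVQA checkpoint the module uses and keep it on the CPU.
model = Pix2StructForConditionalGeneration.from_pretrained("google/pix2struct-docvqa-large").to("cpu")
processor = Pix2StructProcessor.from_pretrained("google/pix2struct-docvqa-large")

# Ask a single follow-up question against the already rendered page image.
img = Image.open("output_image.png")
for question, answer in final_module.generate(model, processor, img, ["What is the invoice number?"]):
    print(f"{question} -> {answer}")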