This is the updated module, in which I made some changes using the pytesseract library.

Saroj Dhiman
Commit 45a79020 authored Nov 06, 2023 by Saroj Dhiman
Showing with 151 additions and 0 deletions
invoice_to_info_module.py
--- a/invoice_to_info_module.py
+++ b/invoice_to_info_module.py
+from pdf2image import convert_from_path
+from PIL import Image
+import torch
+from transformers import Pix2StructForConditionalGeneration, Pix2StructProcessor
+import textract
+import os
+# Debugging aids, set before any CUDA work is done in this module.
+# CUDA_LAUNCH_BLOCKING=1 makes CUDA kernel launches synchronous so an error is
+# reported at the call that caused it (at the cost of throughput).
+# NOTE(review): TORCH_USE_CUDA_DSA is normally a build-time option for
+# device-side assertions; confirm it has any effect as an environment variable.
+os.environ["TORCH_USE_CUDA_DSA"] = "true"
+os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
+# Set the device. The second GPU ("cuda:1") is still preferred, but fall back
+# to the first GPU or to the CPU so the module also works on hosts that do not
+# have two CUDA devices instead of failing when the model is moved to DEVICE.
+if torch.cuda.is_available():
+    DEVICE = "cuda:1" if torch.cuda.device_count() > 1 else "cuda:0"
+else:
+    DEVICE = "cpu"
+def generate(model, processor, img, questions):
+    """Ask every question in ``questions`` about the single image ``img``.
+
+    The image is repeated once per question so the processor receives one
+    image for each prompt. The encoded batch is moved to ``DEVICE`` and passed
+    to ``model.generate`` with a limit of 256 new tokens.
+
+    Returns:
+        A ``zip`` iterator of ``(question, decoded_answer)`` pairs. A zip
+        object can be iterated only once.
+    """
+    repeated_images = [img] * len(questions)
+    batch = processor(images=repeated_images, text=questions, return_tensors="pt")
+    batch = batch.to(DEVICE)
+    generated_ids = model.generate(**batch, max_new_tokens=256)
+    answers = processor.batch_decode(generated_ids, skip_special_tokens=True)
+    return zip(questions, answers)
+def convert_pdf_to_image(filename, page_no):
+    """Render a single page of a PDF and return it as an image.
+
+    Only the requested page is rasterised (via ``first_page``/``last_page``)
+    instead of converting the whole document and discarding the rest.
+
+    Args:
+        filename: Path of the PDF file to read.
+        page_no: 1-based number of the page to render.
+
+    Returns:
+        The image object produced by ``convert_from_path`` for that page.
+
+    Raises:
+        ValueError: If ``page_no`` is below 1 or beyond the last page.
+    """
+    # Reject impossible page numbers before doing any rendering work.
+    if page_no < 1:
+        raise ValueError(f"Page {page_no} is out of range for the provided PDF.")
+    pages = convert_from_path(filename, first_page=page_no, last_page=page_no)
+    # An empty result means the requested page lies past the end of the PDF.
+    if not pages:
+        raise ValueError(f"Page {page_no} is out of range for the provided PDF.")
+    return pages[0]
+def extract_text_from_image(image):
+    """Extract the text content of ``image`` using Textract.
+
+    ``image`` is handed directly to ``textract.process``; the caller in this
+    module passes the filename of a saved image, so despite its name the
+    argument is presumably a file path rather than an in-memory image.
+
+    Returns:
+        The extracted text, decoded from UTF-8 bytes to ``str``.
+    """
+    raw_bytes = textract.process(image)
+    decoded_text = raw_bytes.decode("utf-8")
+    return decoded_text
+def process_pdf_to_image_and_text(filename, page_no, questions, rawtext_filename, image_filename,
+                                  completions_filename="output_text.txt"):
+    """Answer questions about one PDF page and dump its raw text.
+
+    Steps: load the Pix2Struct DocVQA model and processor, render page
+    ``page_no`` of ``filename`` and save it to ``image_filename``, answer each
+    question about that image, print the question/answer pairs, extract the
+    raw text of the saved image into ``rawtext_filename``, and write the
+    question/answer pairs to ``completions_filename``.
+
+    Args:
+        filename: Path of the input PDF.
+        page_no: 1-based page number to process.
+        questions: Sequence of question strings to ask about the page.
+        rawtext_filename: Output path for the extracted raw text.
+        image_filename: Output path for the rendered page image.
+        completions_filename: Output path for the question/answer pairs;
+            defaults to the previously hard-coded "output_text.txt".
+
+    Returns:
+        ``completions_filename``.
+    """
+    # Load the Pix2Struct model and processor
+    model = Pix2StructForConditionalGeneration.from_pretrained("google/pix2struct-docvqa-large").to(DEVICE)
+    processor = Pix2StructProcessor.from_pretrained("google/pix2struct-docvqa-large")
+    image = convert_pdf_to_image(filename, page_no)
+    image.save(image_filename)
+    # generate() returns a one-shot zip iterator; materialise it once so the
+    # same pairs can be both printed and written to disk.
+    qa_pairs = list(generate(model, processor, image, questions))
+    # Print the completions to the terminal
+    for question, answer in qa_pairs:
+        print(f"Question: {question}")
+        print(f"Answer: {answer}\n")
+    # Extract text from the saved image file using Textract
+    extracted_text = extract_text_from_image(image_filename)
+    # Write the extracted text to a text file. The encoding is explicit so
+    # non-ASCII characters do not depend on the platform's default encoding.
+    with open(rawtext_filename, "w", encoding="utf-8") as output_file:
+        output_file.write(extracted_text)
+    # Write the completions to a text file
+    with open(completions_filename, "w", encoding="utf-8") as output_file:
+        for question, answer in qa_pairs:
+            output_file.write(f"Question: {question}\nAnswer: {answer}\n\n")
+    return completions_filename
+# Example usage (uncomment to run):
+# filename = "new_02.pdf"
+# page_no = 1  # Change to the desired page number
+# questions = ["what is the name of company?"]
+#                 #  "what is the date of issue?",
+#                 #  "What is the invoice number?",
+#                 #  "What is the billed to address?",
+#                 #  "what is company?"]  # Add your questions
+# completions_filename = process_pdf_to_image_and_text(filename, 1, questions, "out.txt", "img.png")
+# from pdf2image import convert_from_path
+# from PIL import Image
+# import torch
+# from transformers import Pix2StructForConditionalGeneration, Pix2StructProcessor
+# import pytesseract  # Import Pytesseract
+# import os
+# # os.environ["TORCH_USE_CUDA_DSA"] = "true"
+# # Set the device
+# DEVICE = "cpu"
+# # DEVICE = "cuda:1"
+# def generate(model, processor, img, questions):
+#     inputs = processor(images=[img for _ in range(len(questions))],
+#                       text=questions, return_tensors="pt").to(DEVICE)
+#     predictions = model.generate(**inputs, max_new_tokens=256)
+#     return zip(questions, processor.batch_decode(predictions, skip_special_tokens=True))
+# def convert_pdf_to_image(filename, page_no):
+#     images = convert_from_path(filename)
+#     if page_no < 1 or page_no > len(images):
+#         raise ValueError(f"Page {page_no} is out of range for the provided PDF.")
+#     image = images[page_no - 1]
+#     return image
+# def extract_text_from_image(image):
+#     # Use Pytesseract to extract text from the image
+#     text = pytesseract.image_to_string(image, lang="eng")  # Specify language if needed
+#     return text
+# def process_pdf_to_image_and_text(filename, page_no, questions, rawtext_filename, image_filename):
+#     # Load the Pix2Struct model and processor
+#     model = Pix2StructForConditionalGeneration.from_pretrained("google/pix2struct-docvqa-large").to(DEVICE)
+#     processor = Pix2StructProcessor.from_pretrained("google/pix2struct-docvqa-large")
+#     image = convert_pdf_to_image(filename, page_no)
+#     image.save(image_filename)
+#     completions = generate(model, processor, image, questions)
+#     print(completions)
+#     # Print the completions to the terminal
+#     answers = []
+#     for question, answer in completions:
+#         answers.append(answer)
+#         print(f"Question: {question}")
+#         print(f"Answer: {answer}\n")
+#     # Extract text from the image using Pytesseract
+#     extracted_text = extract_text_from_image(image)
+#     # Write the extracted text to a text file
+#     with open(rawtext_filename, "w") as output_file:
+#         output_file.write(extracted_text)
+#     # Write the completions to a text file
+#     completions_filename = "output_text.txt"
+#     with open(completions_filename, "w+") as output_file:
+#         for question, answer in zip(questions, answers):
+#             output_file.write(f"Question: {question}\nAnswer: {answer}\n\n")
+#     return completions_filename
+# # Example usage
+# filename = "invoice_9.pdf"
+# page_no = 1  # Change to the desired page number
+# questions = ["what is the name of company?",]
+#             #      "what is the date of issue?",
+#             # #    ]  # Add your questions
+# completions_filename = process_pdf_to_image_and_text(filename, 1, questions, "out.txt", "img.png")