cheeyeo · April 25, 2025 16:09
diff --git a/example.py b/example.py
 import json
 import io
 import numpy as np
 import torch
 from transformers import GroundingDinoForObjectDetection, GroundingDinoProcessor
 import triton_python_backend_utils as pb_utils
 from PIL import Image


 class TritonPythonModel:
    """Triton Python-backend model that runs Grounding DINO object detection.

    Every request is expected to carry one image in its ``image_input``
    tensor; the matching response carries the integer pixel boxes of the
    detections that pass the score cut-off in a ``bounding_box`` tensor.
    """

    # Prompt handed to the processor unless the model config overrides it.
    DEFAULT_TEXT_PROMPT = "receipt."
    # Detections whose score, rounded to two decimals, is below this are dropped.
    DEFAULT_MIN_SCORE = 0.6

    def _string_param(self, name, default):
        """Return the ``string_value`` of config parameter *name*, else *default*."""
        return self.model_params.get(name, {}).get("string_value", default)

    def initialize(self, args):
        """Parse the model config and load the processor and model.

        Optional config parameters (each read from its ``string_value``):
        ``huggingface_model``, ``box_threshold``, ``text_threshold``,
        ``text_prompt`` and ``min_score``. Missing entries fall back to the
        defaults used below.

        Args:
            args: Mapping whose ``"model_config"`` entry is the JSON-encoded
                model configuration.
        """
        self.logger = pb_utils.Logger
        self.model_config = json.loads(args["model_config"])
        self.model_params = self.model_config.get("parameters", {})

        hf_model = self._string_param("huggingface_model", "IDEA-Research/grounding-dino-tiny")
        self.logger.log_info(f"Loading HuggingFace model: {hf_model}")

        self.box_threshold = float(self._string_param("box_threshold", 0.3))
        self.text_threshold = float(self._string_param("text_threshold", 0.25))
        self.text_prompt = self._string_param("text_prompt", self.DEFAULT_TEXT_PROMPT)
        self.min_score = float(self._string_param("min_score", self.DEFAULT_MIN_SCORE))

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.processor = GroundingDinoProcessor.from_pretrained(hf_model)
        self.model = GroundingDinoForObjectDetection.from_pretrained(hf_model).to(self.device)

    def execute(self, requests):
        """Run detection for every request and build one response per request.

        Args:
            requests: Iterable of inference requests, each with an
                ``image_input`` tensor.

        Returns:
            A list of ``pb_utils.InferenceResponse`` objects in request order.
            Each holds a ``bounding_box`` tensor of shape ``(N, 4)`` whose
            rows are ``[x1, y1, x2, y2]`` integers; ``N`` is 0 when no
            detection reaches the score cut-off.
        """
        responses = []
        for request in requests:
            input_tensor = pb_utils.get_input_tensor_by_name(request, "image_input")

            # Drop singleton axes, then move the last axis to the front.
            # NOTE(review): assumes the squeezed input is (height, width,
            # channels) - confirm against the model's input config.
            input_image = np.squeeze(input_tensor.as_numpy()).transpose((2, 0, 1))
            self.logger.log_info(f"IMAGE SHAPE: {input_image.shape}")
            _, height, width = input_image.shape

            inputs = self.processor(
                images=input_image, text=self.text_prompt, return_tensors="pt"
            ).to(self.device)

            with torch.no_grad():
                outputs = self.model(**inputs)

            results = self.processor.post_process_grounded_object_detection(
                outputs,
                inputs.input_ids,
                target_sizes=[(height, width)],
                box_threshold=self.box_threshold,
                text_threshold=self.text_threshold,
            )[0]

            self.logger.log_info(f"results from dino model: {results}")

            final_coords = []
            for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
                confidence = round(score.item(), 2)
                if confidence < self.min_score:
                    # TODO: Fail with exception here....
                    self.logger.log_info('Low confidence. Ignore')
                    continue

                box = [round(i, 1) for i in box.tolist()]
                self.logger.log_info(f"Detected {label} with confidence {confidence} at location {box}")
                # int() truncates the rounded coordinates toward zero.
                coords = [int(value) for value in box]
                self.logger.log_info(f"COORDS: {coords}")
                final_coords.append(coords)

            # An empty list would otherwise turn into a float64 array of shape
            # (0,); pin the integer dtype and the (N, 4) layout so the output
            # tensor looks the same whether or not anything was detected.
            boxes = np.array(final_coords, dtype=int).reshape(-1, 4)
            tensor = pb_utils.Tensor("bounding_box", boxes)

            responses.append(pb_utils.InferenceResponse(output_tensors=[tensor]))

        return responses

    def finalize(self):
        """Print a shutdown notice."""
        print("Cleaning up...")
	import json
	import io
	import numpy as np
	import torch
	from transformers import GroundingDinoForObjectDetection, GroundingDinoProcessor
	import triton_python_backend_utils as pb_utils
	from PIL import Image


	class TritonPythonModel:
	    """Triton Python-backend model that runs Grounding DINO object detection.

	    Every request is expected to carry one image in its ``image_input``
	    tensor; the matching response carries the integer pixel boxes of the
	    detections that pass the score cut-off in a ``bounding_box`` tensor.
	    """

	    # Prompt handed to the processor unless the model config overrides it.
	    DEFAULT_TEXT_PROMPT = "receipt."
	    # Detections whose score, rounded to two decimals, is below this are dropped.
	    DEFAULT_MIN_SCORE = 0.6

	    def _string_param(self, name, default):
	        """Return the ``string_value`` of config parameter *name*, else *default*."""
	        return self.model_params.get(name, {}).get("string_value", default)

	    def initialize(self, args):
	        """Parse the model config and load the processor and model.

	        Optional config parameters (each read from its ``string_value``):
	        ``huggingface_model``, ``box_threshold``, ``text_threshold``,
	        ``text_prompt`` and ``min_score``. Missing entries fall back to the
	        defaults used below.

	        Args:
	            args: Mapping whose ``"model_config"`` entry is the JSON-encoded
	                model configuration.
	        """
	        self.logger = pb_utils.Logger
	        self.model_config = json.loads(args["model_config"])
	        self.model_params = self.model_config.get("parameters", {})

	        hf_model = self._string_param("huggingface_model", "IDEA-Research/grounding-dino-tiny")
	        self.logger.log_info(f"Loading HuggingFace model: {hf_model}")

	        self.box_threshold = float(self._string_param("box_threshold", 0.3))
	        self.text_threshold = float(self._string_param("text_threshold", 0.25))
	        self.text_prompt = self._string_param("text_prompt", self.DEFAULT_TEXT_PROMPT)
	        self.min_score = float(self._string_param("min_score", self.DEFAULT_MIN_SCORE))

	        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
	        self.processor = GroundingDinoProcessor.from_pretrained(hf_model)
	        self.model = GroundingDinoForObjectDetection.from_pretrained(hf_model).to(self.device)

	    def execute(self, requests):
	        """Run detection for every request and build one response per request.

	        Args:
	            requests: Iterable of inference requests, each with an
	                ``image_input`` tensor.

	        Returns:
	            A list of ``pb_utils.InferenceResponse`` objects in request order.
	            Each holds a ``bounding_box`` tensor of shape ``(N, 4)`` whose
	            rows are ``[x1, y1, x2, y2]`` integers; ``N`` is 0 when no
	            detection reaches the score cut-off.
	        """
	        responses = []
	        for request in requests:
	            input_tensor = pb_utils.get_input_tensor_by_name(request, "image_input")

	            # Drop singleton axes, then move the last axis to the front.
	            # NOTE(review): assumes the squeezed input is (height, width,
	            # channels) - confirm against the model's input config.
	            input_image = np.squeeze(input_tensor.as_numpy()).transpose((2, 0, 1))
	            self.logger.log_info(f"IMAGE SHAPE: {input_image.shape}")
	            _, height, width = input_image.shape

	            inputs = self.processor(
	                images=input_image, text=self.text_prompt, return_tensors="pt"
	            ).to(self.device)

	            with torch.no_grad():
	                outputs = self.model(**inputs)

	            results = self.processor.post_process_grounded_object_detection(
	                outputs,
	                inputs.input_ids,
	                target_sizes=[(height, width)],
	                box_threshold=self.box_threshold,
	                text_threshold=self.text_threshold,
	            )[0]

	            self.logger.log_info(f"results from dino model: {results}")

	            final_coords = []
	            for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
	                confidence = round(score.item(), 2)
	                if confidence < self.min_score:
	                    # TODO: Fail with exception here....
	                    self.logger.log_info('Low confidence. Ignore')
	                    continue

	                box = [round(i, 1) for i in box.tolist()]
	                self.logger.log_info(f"Detected {label} with confidence {confidence} at location {box}")
	                # int() truncates the rounded coordinates toward zero.
	                coords = [int(value) for value in box]
	                self.logger.log_info(f"COORDS: {coords}")
	                final_coords.append(coords)

	            # An empty list would otherwise turn into a float64 array of shape
	            # (0,); pin the integer dtype and the (N, 4) layout so the output
	            # tensor looks the same whether or not anything was detected.
	            boxes = np.array(final_coords, dtype=int).reshape(-1, 4)
	            tensor = pb_utils.Tensor("bounding_box", boxes)

	            responses.append(pb_utils.InferenceResponse(output_tensors=[tensor]))

	        return responses

	    def finalize(self):
	        """Print a shutdown notice."""
	        print("Cleaning up...")