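"""Workplace safety analysis demo.

A Gradio app that sends an uploaded workplace image to a Groq-hosted
Llama 3.2 vision model, first for scene understanding and then for a
hazard analysis, and draws the reported hazards back onto the image.

Assumed setup (not stated in the source): the gradio, groq,
opencv-python, numpy, and pillow packages are installed, and the
GROQ_API_KEY environment variable is set.
"""
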
import gradio as gr
import cv2
import numpy as np
from groq import Groq
import time
from PIL import Image as PILImage
import io
import os
import base64

class SafetyMonitor:
    def __init__(self):
        self.client = Groq()
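        # Groq() reads the API key from the GROQ_API_KEY environment variable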
        self.model_name = "llama-3.2-90b-vision-preview"
        self.max_image_size = (800, 800)
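        # Colors cycled across observation boxes; note the image arrays
        # come from Gradio as RGB, so these tuples render as RGB values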
        self.colors = [(0, 0, 255), (255, 0, 0), (0, 255, 0), (255, 255, 0), (255, 0, 255)]

    def preprocess_image(self, frame):
        """Prepare image for analysis."""
        if len(frame.shape) == 2:
            frame = cv2.cvtColor(frame, cv2.COLOR_GRAY2RGB)
        elif len(frame.shape) == 3 and frame.shape[2] == 4:
            frame = cv2.cvtColor(frame, cv2.COLOR_RGBA2RGB)
        
        return self.resize_image(frame)

    def resize_image(self, image):
        """Resize image while maintaining aspect ratio."""
        height, width = image.shape[:2]
        if height > self.max_image_size[1] or width > self.max_image_size[0]:
            aspect = width / height
            if width > height:
                new_width = self.max_image_size[0]
                new_height = int(new_width / aspect)
            else:
                new_height = self.max_image_size[1]
                new_width = int(new_height * aspect)
            return cv2.resize(image, (new_width, new_height), interpolation=cv2.INTER_AREA)
        return image

    def encode_image(self, frame):
        """Convert image to base64 encoding."""
        frame_pil = PILImage.fromarray(frame)
        buffered = io.BytesIO()
        frame_pil.save(buffered, format="JPEG", quality=95)
        img_base64 = base64.b64encode(buffered.getvalue()).decode('utf-8')
        return f"data:image/jpeg;base64,{img_base64}"

    def get_scene_context(self, image: np.ndarray) -> str:
        """Get scene understanding to determine context."""
        try:
            image_url = self.encode_image(image)
            completion = self.client.chat.completions.create(
                model=self.model_name,
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "text",
                                "text": """Describe the key areas and elements visible in this construction/workplace image. Include:
                                1. Worker locations and activities
                                2. Equipment and machinery positions
                                3. Material storage or work areas
                                4. Environmental features
                                5. Access ways and pathways

                                Format as:
                                - Element: precise location description"""
                            },
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": image_url
                                }
                            }
                        ]
                    }
                ],
                temperature=0.3,
                max_tokens=200,
                stream=False
            )
            return completion.choices[0].message.content
        except Exception as e:
            print(f"Scene analysis error: {str(e)}")
            return ""

    def analyze_frame(self, frame: np.ndarray) -> tuple[str, dict]:
        """Analyze frame and return both safety analysis and scene context."""
        if frame is None:
            return "No frame received", {}

        # Preprocess once up front so both model calls get the same
        # bounded RGB image (also handles grayscale/RGBA inputs before
        # JPEG encoding)
        frame = self.preprocess_image(frame)

        # First get scene understanding
        scene_context = self.get_scene_context(frame)
        scene_regions = self.parse_scene_context(scene_context)

        # Then perform safety analysis with context
        image_url = self.encode_image(frame)

        try:
            completion = self.client.chat.completions.create(
                model=self.model_name,
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "text",
                                "text": """Analyze this workplace image for safety concerns. For each identified hazard:
                                1. Specify the exact location where the hazard exists
                                2. Describe the specific safety issue
                                3. Note any violations or risks

                                Format each observation exactly as:
                                - <location>area:detailed hazard description</location>

                                Consider all safety aspects:
                                - PPE compliance
                                - Ergonomic risks
                                - Equipment safety
                                - Environmental hazards
                                - Material handling
                                - Access/egress
                                - Work procedures
                                """
                            },
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": image_url
                                }
                            }
                        ]
                    }
                ],
                temperature=0.5,
                max_tokens=500,
                stream=False
            )
            return completion.choices[0].message.content, scene_regions
        except Exception as e:
            print(f"Analysis error: {str(e)}")
            return f"Analysis Error: {str(e)}", scene_regions

    def parse_scene_context(self, context: str) -> dict:
        """Parse scene context lines of the form "- Element: location
        description" into an {element: location} mapping."""
        regions = {}
        for line in context.split('\n'):
            if line.strip().startswith('-'):
                parts = line.strip('- ').split(':')
                if len(parts) == 2:
                    element_type = parts[0].strip()
                    location = parts[1].strip()
                    regions[element_type] = location
        return regions
    
    def get_region_coordinates(self, location: str, image_shape: tuple) -> tuple:
        """Convert a textual location description (e.g. "top left") into
        (x1, y1, x2, y2) pixel coordinates."""
        height, width = image_shape[:2]

        # Parse location description for spatial information
        location = location.lower()
        x1, y1, x2, y2 = 0, 0, width, height  # Default to full image

        # Horizontal position
        if 'left' in location:
            x2 = width // 2
        elif 'right' in location:
            x1 = width // 2
        elif 'center' in location:
            x1 = width // 4
            x2 = 3 * width // 4

        # Vertical position
        if 'top' in location:
            y2 = height // 2
        elif 'bottom' in location:
            y1 = height // 2
        elif 'middle' in location or 'center' in location:
            y1 = height // 4
            y2 = 3 * height // 4

        return (x1, y1, x2, y2)
    
    def draw_observations(self, image: np.ndarray, observations: list, scene_regions: dict) -> np.ndarray:
        """Draw safety observations as labeled boxes on the image."""
        height, width = image.shape[:2]
        font = cv2.FONT_HERSHEY_SIMPLEX
        font_scale = 0.5
        thickness = 2
        padding = 10

        for idx, obs in enumerate(observations):
            color = self.colors[idx % len(self.colors)]

            # The location text is parsed directly into a rectangular
            # region (scene_regions is accepted but not yet consulted)
            location = obs['location'].lower()
            x1, y1, x2, y2 = self.get_region_coordinates(location, image.shape)

            # Draw observation box
            cv2.rectangle(image, (x1, y1), (x2, y2), color, 2)

            # Add label, truncated to keep it readable
            label = obs['description'][:50] + "..." if len(obs['description']) > 50 else obs['description']
            label_size, _ = cv2.getTextSize(label, font, font_scale, thickness)

            # Position text above the box
            text_x = max(0, x1)
            text_y = max(label_size[1] + padding, y1 - padding)

            # Draw text background
            cv2.rectangle(image,
                          (text_x, text_y - label_size[1] - padding),
                          (text_x + label_size[0] + padding, text_y),
                          color, -1)

            # Draw text
            cv2.putText(image, label,
                        (text_x + padding//2, text_y - padding//2),
                        font, font_scale, (255, 255, 255), thickness)

        return image

    def process_frame(self, frame: np.ndarray) -> tuple[np.ndarray, str]:
        """Process frame with safety analysis and visualization."""
        if frame is None:
            return None, "No image provided"

        # Get analysis and scene context
        analysis, scene_regions = self.analyze_frame(frame)
        display_frame = frame.copy()

        # Parse observations of the form:
        # - <location>area:detailed hazard description</location>
        observations = []
        for line in analysis.split('\n'):
            line = line.strip()
            if line.startswith('-') and '<location>' in line and '</location>' in line:
                start = line.find('<location>') + len('<location>')
                end = line.find('</location>')
                location_description = line[start:end].strip()

                if ':' in location_description:
                    location, description = location_description.split(':', 1)
                    observations.append({
                        'location': location.strip(),
                        'description': description.strip()
                    })

        # Draw observations if any were found
        if observations:
            annotated_frame = self.draw_observations(display_frame, observations, scene_regions)
            return annotated_frame, analysis

        return display_frame, analysis

def create_monitor_interface():
    monitor = SafetyMonitor()
    
    with gr.Blocks() as demo:
        gr.Markdown("# Safety Analysis System powered by Llama 3.2 90B Vision")
        
        with gr.Row():
            input_image = gr.Image(label="Upload Image")
            output_image = gr.Image(label="Safety Analysis")
        
        analysis_text = gr.Textbox(label="Detailed Analysis", lines=5)
            
        def analyze_image(image):
            if image is None:
                return None, "No image provided"
            try:
                processed_frame, analysis = monitor.process_frame(image)
                return processed_frame, analysis
            except Exception as e:
                print(f"Processing error: {str(e)}")
                return None, f"Error processing image: {str(e)}"
            
        input_image.change(
            fn=analyze_image,
            inputs=input_image,
            outputs=[output_image, analysis_text]
        )

        gr.Markdown("""
        ## Instructions:
        1. Upload any workplace/safety-related image
        2. View identified hazards and their locations
        3. Read detailed analysis of safety concerns
        """)

    return demo

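# Example invocation (filename assumed; e.g. if this file is saved as app.py):
#   GROQ_API_KEY=<your-key> python app.py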
if __name__ == "__main__":
    demo = create_monitor_interface()
    demo.launch()