readme done

2025-11-08 18:22:16 -06:00
parent 8059a72988
commit 3fcec25c26
4 changed files with 272 additions and 496438 deletions

README.md Normal file

@@ -0,0 +1,95 @@
# Hand-Raise Detection System
A computer vision system that detects hand-raising gestures in videos using YOLOv8 object detection and pose estimation models.
## Overview
This system uses a two-stage approach:
1. **Object Detection**: Detects people in the video frame using YOLOv8
2. **Pose Estimation**: Analyzes upper body keypoints to determine if hands are raised
The system outputs an annotated video with bounding boxes and skeleton overlays, plus JSON event logs with timestamps of hand-raise events.
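The core of the per-frame loop in `main.py` boils down to the sketch below. It is simplified: the real script also pads each crop with a margin, requires a raise to persist for `SUSTAIN_FRAMES`, draws overlays, and writes the event logs. The helper name `people_with_keypoints` is only for illustration.

```python
from ultralytics import YOLO

detect_model = YOLO("yolov8n.pt")      # stage 1: person detection
pose_model = YOLO("yolov8n-pose.pt")   # stage 2: pose estimation on upper-body crops

def people_with_keypoints(frame, upper_body_ratio=0.5):
    """Yield (bbox, keypoints) for each person detected in a BGR frame."""
    results = detect_model.predict(frame, conf=0.3, classes=[0], verbose=False)
    for r in results:
        if r.boxes is None or len(r.boxes) == 0:
            continue
        for box in r.boxes.xyxy.cpu().numpy():
            x1, y1, x2, y2 = map(int, box[:4])
            upper_y2 = y1 + int((y2 - y1) * upper_body_ratio)  # keep only the top half
            crop = frame[max(0, y1):upper_y2, max(0, x1):x2]
            if crop.size == 0:
                continue
            for p in pose_model.predict(crop, conf=0.2, verbose=False):
                if p.keypoints is not None and p.keypoints.data.shape[0] > 0:
                    yield (x1, y1, x2, y2), p.keypoints.data[0].cpu().numpy()
```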
## Requirements
```bash
pip install ultralytics opencv-python torch
```
Supports Python 3.13+ with optional GPU acceleration via CUDA.
## Usage
```bash
python main.py [input_video.mp4]
```
**Outputs:**
- `out.avi` - Annotated video with visual overlays
- `events.json` - Structured event log
- `events.ndjson` - Newline-delimited JSON events
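`events.json` wraps the video frame rate and a list of events; each entry carries the person ID, event type, frame index, time, and bounding box. The values below are illustrative:

```json
{
  "video_fps": 30.0,
  "events": [
    {
      "id": 0,
      "event": "hand_raise_start",
      "frame": 412,
      "time_seconds": 13.733,
      "timestamp": "0:00:13.733",
      "bbox": { "x": 540, "y": 180, "w": 220, "h": 610 }
    }
  ]
}
```

`events.ndjson` contains the same event objects, one per line.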
## Configuration
Key parameters in the script:
- `CONF_THRES`: Detection confidence threshold (default: 0.3)
- `UPPER_BODY_RATIO`: Portion of person bbox to analyze (default: 0.5)
- `SUSTAIN_FRAMES`: Required consecutive frames for valid detection (default: 15)
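These are plain module-level constants near the top of `main.py`:

```python
DETECT_MODEL = "yolov8n.pt"        # person detector
POSE_MODEL = "yolov8n-pose.pt"     # pose model run on upper-body crops
CONF_THRES = 0.3                   # detection confidence threshold
UPPER_BODY_RATIO = 0.5             # analyze the top 50% of each person bbox
SUSTAIN_FRAMES = 15                # ~0.5 s at 30 fps before an event fires
```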
## Development Challenges
### 1. Color Space Encoding (BGR vs RGB)
Getting the correct color encoding when streaming frames into Ultralytics required careful handling: OpenCV uses BGR by default, while most ML models expect RGB. We also needed `np.ascontiguousarray()` to ensure a proper memory layout for model input.
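A minimal sketch of the kind of conversion involved (the helper name `to_rgb_contiguous` is illustrative; the current script actually passes BGR frames straight to Ultralytics, which accepts them):

```python
import numpy as np

def to_rgb_contiguous(frame_bgr):
    """Reverse the channel order (BGR -> RGB) and force a contiguous layout.

    Slicing with ::-1 returns a non-contiguous view, which some model
    front ends reject; np.ascontiguousarray() copies it into a proper layout.
    """
    return np.ascontiguousarray(frame_bgr[:, :, ::-1])
```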
### 2. Upper Body Isolation
**Not fully accomplished.** The current approach crops the top 50% of each detected person's bounding box, but this is a crude geometric approximation that doesn't account for pose variations, sitting vs standing positions, or semantic understanding of anatomy.
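The crop is pure bounding-box arithmetic, condensed from `main.py`:

```python
x1, y1, x2, y2 = map(int, box[:4])                      # person bbox from the detector
person_height = y2 - y1
upper_y2 = y1 + int(person_height * UPPER_BODY_RATIO)   # keep only the top 50%

margin = int(person_height * 0.1)                       # a little extra context
crop_x1 = max(0, x1 - margin)
crop_y1 = max(0, y1 - margin)
crop_x2 = min(width, x2 + margin)
crop_y2 = min(height, upper_y2 + margin)

upper_body_crop = frame[crop_y1:crop_y2, crop_x1:crop_x2]
```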
### 3. Person ID Tracking
**Major limitation.** The system assigns IDs based on detection order within each frame, but these IDs are **not consistent across frames**. A person detected first in one frame might be detected third in the next, causing event logs to incorrectly attribute hand-raises to wrong individuals.
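The root cause is that the ID is just the enumeration index of that frame's detections (from `main.py`), so it restarts at 0 every frame and follows whatever order the detector happens to return:

```python
for pid, box in enumerate(boxes):   # pid is per-frame detection order, not a track ID
    ...
```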
### 4. Handling Overlapping/Intersecting People
When people overlap or stand close together, bounding boxes merge or occlude one another, pose keypoints can be assigned to the wrong person, and a cropped region may contain more than one person, which confuses the pose model.
### 5. Confidence Thresholds & Temporal Smoothing
Finding optimal values required significant trial and error. These values are scene-dependent and may need tuning per video:
- **Detection confidence** (0.3): Balance between detection rate and false positives
- **Keypoint confidence** (0.4): Ensure reliable joint detections
- **Sustain frames** (15): Balance between responsiveness and stability
- **Vertical threshold** (0.15): Prevents "hands near face" false positives
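The geometric check these thresholds feed lives in `is_hand_raised`; a condensed version for one arm is below (the helper name `side_is_raised` is illustrative). Keypoint coordinates are assumed normalized with y increasing downward; indices 5/7/9 and 6/8/10 are the left and right shoulder/elbow/wrist.

```python
def side_is_raised(shoulder, elbow, wrist, kp_conf=0.4):
    # all three joints must be confidently detected
    if min(shoulder[2], elbow[2], wrist[2]) < kp_conf:
        return False
    # wrist must be well above the shoulder, not just level with it
    if not (wrist[1] < shoulder[1] - 0.15):
        return False
    # arm extended upward: wrist above elbow above shoulder
    if not (shoulder[1] > elbow[1] > wrist[1]):
        return False
    # wrist roughly over the elbow, not reaching across the body
    return abs(wrist[0] - elbow[0]) <= 0.2
```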
## Known Issues
### False Positives:
- Quick arm movements and stretching/yawning gestures
- Occlusions causing inconsistent keypoint detections
- Poor lighting reducing keypoint confidence
- Crowded scenes confusing detection and pose models
### False Negatives:
- Partial hand-raises (elbow bent, hand not high enough)
- Profile views with low keypoint visibility
- Fast movements under 15 consecutive frames
- Occlusions from furniture or other people
## Possible Improvements
### Quick Wins:
1. **Add person tracking**: Use ByteTrack or DeepSORT for consistent IDs across frames (see the sketch after this list)
2. **Improve upper body detection**: Use pose-based crops (shoulder-to-hip distance) instead of fixed ratio
3. **Expose config as CLI args**: Enable per-video parameter tuning
4. **Add temporal smoothing**: Moving average of hand positions over 3-5 frames
5. **Implement hysteresis**: Prevent rapid on/off flickering with cooldown logic
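For the first item, Ultralytics bundles ByteTrack, so a rough, untested sketch of swapping `predict()` for `track()` and reusing the returned track IDs might look like this:

```python
from ultralytics import YOLO

detect_model = YOLO("yolov8n.pt")

# inside the per-frame loop; persist=True keeps tracker state between frames
results = detect_model.track(frame, conf=0.3, classes=[0],
                             tracker="bytetrack.yaml", persist=True, verbose=False)

for r in results:
    if r.boxes is None or r.boxes.id is None:
        continue
    for box, track_id in zip(r.boxes.xyxy.cpu().numpy(),
                             r.boxes.id.int().cpu().tolist()):
        pid = int(track_id)   # stable across frames, unlike enumerate()
        # ... crop, pose, and event logic as before, keyed by pid
```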
### Longer-Term Enhancements:
1. **Multi-person pose estimation**: Single model pass for all people simultaneously
2. **Action recognition models**: Use temporal models (SlowFast, X3D) instead of per-frame pose
3. **Calibration mode**: Auto-tune thresholds using labeled ground truth clips
4. **Intent classification**: Distinguish deliberate signals from casual gestures
5. **End-to-end fine-tuning**: Train on hand-raise specific dataset
## Performance
- **Processing speed**: ~0.96x real-time on an RTX 4090 (about 960 ms of processing per second of video)
- **Memory usage**: ~2-4GB depending on video resolution
- **Optimization opportunities**: Batch processing, frame skipping, async I/O, model quantization

events.json

File diff suppressed because it is too large


main.py

@@ -1,6 +1,6 @@
#!/usr/bin/env python3
"""
Hand-raise detector using YOLOv8-pose (no MediaPipe).
Hand-raise detector using YOLOv8 object detection + pose on upper body crops.
Requirements:
pip install ultralytics opencv-python torch
@@ -16,32 +16,25 @@ import threading
import time
from datetime import timedelta
from collections import defaultdict
import cv2
import numpy as np
import torch
from ultralytics import YOLO
# TODO: wtf is going on with the 35468 events
# --------------------
# ---------------------
# Config
# --------------------
# ---------------------
VIDEO_PATH = "input.mp4"
OUT_PATH = "out.avi"
EVENT_JSON = "events.json"
EVENT_ND = "events.ndjson"
MODEL_NAME = "yolov8m-pose.pt"
IMG_SIZE = 1280 # Larger image size for better detection
CONF_THRES = 0.05 # Very low threshold
IOU = 0.45
KP_CONF_THRES = 0.1 # Very low keypoint threshold
SUSTAIN_FRAMES = 3
DRAW_SKELETON = False
TRACKER = True
DETECT_MODEL = "yolov8n.pt" # Fast object detection
POSE_MODEL = "yolov8n-pose.pt" # Pose estimation
CONF_THRES = 0.3
UPPER_BODY_RATIO = 0.5 # Use top 50% of detected person
SUSTAIN_FRAMES = 15 # ~0.5 seconds at 30fps - more strict
FRAME_QUEUE_SIZE = 8
# ---------------------
@@ -57,41 +50,42 @@ def format_time(seconds: float) -> str:
t = f"{sec}.{ms[:3].ljust(3,'0')}"
return t
def get_bbox_from_keypoints(kpts, width, height, conf_th=KP_CONF_THRES):
if kpts is None or kpts.size == 0:
return None
vis = kpts[:, 2] >= conf_th
if not vis.any():
return None
xs = (kpts[vis, 0] * width).astype(int)
ys = (kpts[vis, 1] * height).astype(int)
x1, y1, x2, y2 = int(xs.min()), int(ys.min()), int(xs.max()), int(ys.max())
pad_x = max(4, int(0.05 * (x2 - x1 + 1)))
pad_y = max(4, int(0.05 * (y2 - y1 + 1)))
return {"x": max(0, x1 - pad_x),
"y": max(0, y1 - pad_y),
"w": (x2 - x1 + 2*pad_x),
"h": (y2 - y1 + 2*pad_y)}
def is_hand_raised(kpts, conf_th=KP_CONF_THRES):
def is_hand_raised(kpts, conf_th=0.4):
"""
Check if hand is raised with stricter criteria.
Keypoints: 5=left_shoulder, 6=right_shoulder, 7=left_elbow, 8=right_elbow, 9=left_wrist, 10=right_wrist
"""
if kpts is None or kpts.size == 0:
return False
try:
def check(s_idx, e_idx, w_idx):
s, e, w = kpts[s_idx], kpts[e_idx], kpts[w_idx]
# Only require wrist to be visible for partial bodies
if w[2] < conf_th:
def check_side(s_idx, e_idx, w_idx):
shoulder = kpts[s_idx]
elbow = kpts[e_idx]
wrist = kpts[w_idx]
# All three points must be visible for reliable detection
if shoulder[2] < conf_th or elbow[2] < conf_th or wrist[2] < conf_th:
return False
# If shoulder visible, check if wrist is above it
if s[2] >= conf_th:
return w[1] < s[1]
# If elbow visible, check if wrist is above it
if e[2] >= conf_th:
return w[1] < e[1]
# If only wrist visible, check if it's in upper portion of frame
return w[1] < 0.4
left = check(5,7,9)
right = check(6,8,10)
# Wrist must be significantly above shoulder (not just barely)
vertical_threshold = 0.15 # Wrist must be 15% of frame height above shoulder
if not (wrist[1] < shoulder[1] - vertical_threshold):
return False
# Elbow should be between shoulder and wrist (arm is extended upward)
# This prevents detecting hands near face as "raised"
if not (shoulder[1] > elbow[1] > wrist[1]):
return False
# Wrist and elbow should be roughly aligned horizontally (not reaching across body)
horizontal_distance = abs(wrist[0] - elbow[0])
if horizontal_distance > 0.2: # Too far apart horizontally
return False
return True
left = check_side(5, 7, 9) # left shoulder, elbow, wrist
right = check_side(6, 8, 10) # right shoulder, elbow, wrist
return left or right
except:
return False
@@ -132,7 +126,7 @@ def main(video_path=VIDEO_PATH):
print(f"[INFO] Video: {width}x{height} @ {fps:.2f}fps, {total_frames} frames")
fourcc = cv2.VideoWriter_fourcc(*"XVID") # XVID codec for AVI
fourcc = cv2.VideoWriter_fourcc(*"XVID")
out = cv2.VideoWriter(OUT_PATH, fourcc, fps, (width, height))
if not out.isOpened():
@@ -140,9 +134,13 @@ def main(video_path=VIDEO_PATH):
device = "cuda:0" if torch.cuda.is_available() else "cpu"
print(f"[INFO] Using device: {device}")
model = YOLO(MODEL_NAME)
model.to(device)
print(f"[INFO] Model loaded: {MODEL_NAME}")
# Load both models
detect_model = YOLO(DETECT_MODEL)
pose_model = YOLO(POSE_MODEL)
detect_model.to(device)
pose_model.to(device)
print(f"[INFO] Models loaded: {DETECT_MODEL} + {POSE_MODEL}")
# Start frame reader thread
fq = queue.Queue(maxsize=FRAME_QUEUE_SIZE)
@@ -155,7 +153,6 @@ def main(video_path=VIDEO_PATH):
events = []
frame_idx = 0
last_progress = -1
saved_test_frame = False
print("[INFO] Processing frames...")
print("Progress: [" + " " * 50 + "] 0%", end="\r")
@@ -167,83 +164,138 @@ def main(video_path=VIDEO_PATH):
orig_frame = frame.copy()
# Save a test frame to verify video is readable
if not saved_test_frame and frame_idx == 10:
cv2.imwrite("test_frame.jpg", frame)
print(f"\n[DEBUG] Saved test frame: {frame.shape}, dtype: {frame.dtype}")
saved_test_frame = True
# Step 1: Detect people using object detection
detect_results = detect_model.predict(frame, conf=CONF_THRES, classes=[0], verbose=False)
# Try detection with original BGR frame (YOLO can handle BGR)
results = model.predict(frame, imgsz=IMG_SIZE, conf=CONF_THRES, iou=IOU, verbose=False, classes=[0])
people_detected = 0
keypoints_list = []
ids = []
# Debug: Check what YOLO detected
detections_found = False
for r in results:
# Check if ANY detections exist
if hasattr(r, 'boxes') and r.boxes is not None and len(r.boxes) > 0:
print(f"\n[DEBUG] Frame {frame_idx}: Found {len(r.boxes)} boxes", end="")
for det_result in detect_results:
if det_result.boxes is None or len(det_result.boxes) == 0:
continue
boxes = det_result.boxes.xyxy.cpu().numpy()
people_detected = len(boxes)
if hasattr(r, "keypoints") and r.keypoints is not None:
try:
kps = r.keypoints.data.cpu().numpy() if hasattr(r.keypoints.data, "cpu") else np.asarray(r.keypoints.data)
print(f" | Keypoints shape: {kps.shape}", end="")
if kps.ndim==3 and kps.shape[0] > 0:
detections_found = True
for i in range(kps.shape[0]):
keypoints_list.append(kps[i].astype(np.float32))
ids.append(i)
print(f" | Person {i} keypoints: {np.sum(kps[i][:,2] > KP_CONF_THRES)}/17", end="")
except Exception as e:
print(f" | Error: {e}", end="")
# Process each detected person
for pid, box in enumerate(boxes):
x1, y1, x2, y2 = map(int, box[:4])
# Calculate upper body region (top portion of bbox)
person_height = y2 - y1
upper_body_height = int(person_height * UPPER_BODY_RATIO)
upper_y2 = y1 + upper_body_height
# Expand bbox slightly for better context
margin = int(person_height * 0.1)
crop_x1 = max(0, x1 - margin)
crop_y1 = max(0, y1 - margin)
crop_x2 = min(width, x2 + margin)
crop_y2 = min(height, upper_y2 + margin)
# Crop upper body region
upper_body_crop = frame[crop_y1:crop_y2, crop_x1:crop_x2]
if upper_body_crop.size == 0:
continue
# Step 2: Run pose estimation on cropped upper body
pose_results = pose_model.predict(upper_body_crop, conf=0.2, verbose=False)
has_keypoints = False
for pose_result in pose_results:
if not hasattr(pose_result, "keypoints") or pose_result.keypoints is None:
continue
kps_data = pose_result.keypoints.data.cpu().numpy() if hasattr(pose_result.keypoints.data, "cpu") else np.asarray(pose_result.keypoints.data)
if kps_data.ndim == 3 and kps_data.shape[0] > 0:
has_keypoints = True
kpts = kps_data[0] # Take first person in crop
raised = is_hand_raised(kpts)
prev = raised_state.get(pid, 0)
counter = hold_counter.get(pid, 0)
counter = counter + 1 if raised else 0
sustained = counter >= SUSTAIN_FRAMES
# Require cooldown period before allowing another raise event
# This prevents flickering detections
if not raised and prev == 1:
counter = -5 # Cooldown for 5 frames
raised_state[pid] = 1 if sustained else 0
hold_counter[pid] = counter
timestamp = format_time(frame_idx/fps)
bbox = {"x": x1, "y": y1, "w": x2-x1, "h": y2-y1}
if sustained and prev == 0:
events.append({"id": pid, "event": "hand_raise_start", "frame": frame_idx,
"time_seconds": round(frame_idx/fps, 3), "timestamp": timestamp, "bbox": bbox})
if not sustained and prev == 1:
events.append({"id": pid, "event": "hand_raise_end", "frame": frame_idx,
"time_seconds": round(frame_idx/fps, 3), "timestamp": timestamp, "bbox": bbox})
# Draw on original frame
color = (0, 255, 0) if sustained else (255, 0, 0)
thickness = 3 if sustained else 2
cv2.rectangle(orig_frame, (x1, y1), (x2, y2), color, thickness)
# Draw upper body region
cv2.rectangle(orig_frame, (crop_x1, crop_y1), (crop_x2, crop_y2), (255, 255, 0), 1)
# Draw skeleton on original frame
# Convert keypoints from crop coordinates to original frame coordinates
crop_height, crop_width = upper_body_crop.shape[:2]
for i, kp in enumerate(kpts):
if kp[2] > 0.3: # If keypoint is visible
# Convert from normalized coords to crop coords to frame coords
kp_x = int(kp[0] * crop_width + crop_x1)
kp_y = int(kp[1] * crop_height + crop_y1)
cv2.circle(orig_frame, (kp_x, kp_y), 4, (0, 255, 255), -1)
# Draw skeleton connections (upper body only)
connections = [
(5, 6), # shoulders
(5, 7), # left shoulder to elbow
(7, 9), # left elbow to wrist
(6, 8), # right shoulder to elbow
(8, 10), # right elbow to wrist
(5, 11), # left shoulder to hip
(6, 12), # right shoulder to hip
(11, 12), # hips
]
for conn in connections:
pt1_idx, pt2_idx = conn
if kpts[pt1_idx][2] > 0.3 and kpts[pt2_idx][2] > 0.3:
pt1_x = int(kpts[pt1_idx][0] * crop_width + crop_x1)
pt1_y = int(kpts[pt1_idx][1] * crop_height + crop_y1)
pt2_x = int(kpts[pt2_idx][0] * crop_width + crop_x1)
pt2_y = int(kpts[pt2_idx][1] * crop_height + crop_y1)
cv2.line(orig_frame, (pt1_x, pt1_y), (pt2_x, pt2_y), (0, 255, 255), 2)
label = f"ID {pid} {'HAND UP' if sustained else 'detected'}"
# Show counter for debugging
if counter > 0:
label += f" ({counter}/{SUSTAIN_FRAMES})"
cv2.putText(orig_frame, label, (max(0, x1), max(20, y1-10)),
cv2.FONT_HERSHEY_SIMPLEX, 0.7, color, 2)
# Show visible keypoints count
visible_kpts = np.sum(kpts[:, 2] > 0.3)
cv2.putText(orig_frame, f"KP: {visible_kpts}/17", (max(0, x1), max(40, y1+15)),
cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 1)
break
# Debug logging every 30 frames
if frame_idx % 30 == 0 and frame_idx > 0:
print(f"\n[DEBUG] Frame {frame_idx}: Detected {len(keypoints_list)} people", end="")
for local_idx, kpts in enumerate(keypoints_list):
pid = ids[local_idx]
raised = is_hand_raised(kpts)
bbox = get_bbox_from_keypoints(kpts, width, height)
prev = raised_state.get(pid,0)
counter = hold_counter.get(pid,0)
counter = counter+1 if raised else 0
sustained = counter>=SUSTAIN_FRAMES
raised_state[pid] = 1 if sustained else 0
hold_counter[pid] = counter
timestamp = format_time(frame_idx/fps)
if sustained and prev==0:
events.append({"id":pid,"event":"hand_raise_start","frame":frame_idx,"time_seconds":round(frame_idx/fps,3),"timestamp":timestamp,"bbox":bbox})
if not sustained and prev==1:
events.append({"id":pid,"event":"hand_raise_end","frame":frame_idx,"time_seconds":round(frame_idx/fps,3),"timestamp":timestamp,"bbox":bbox})
# Always draw bounding box for detected people
if bbox is not None:
x,y,w,h = bbox["x"],bbox["y"],bbox["w"],bbox["h"]
color = (0,255,0) if sustained else (255,0,0) # Green if hand up, Blue if not
thickness = 3 if sustained else 2
cv2.rectangle(orig_frame,(x,y),(x+w,y+h),color,thickness)
label = f"ID {pid} {'HAND UP' if sustained else 'detected'}"
cv2.putText(orig_frame,label,(max(0,x),max(20,y-10)),cv2.FONT_HERSHEY_SIMPLEX,0.7,color,2)
if DRAW_SKELETON:
try:
orig_frame = results[0].plot()
except:
pass
# Always draw a frame counter for debugging
cv2.putText(orig_frame, f"Frame: {frame_idx} | People: {len(keypoints_list)}",
# Frame counter
cv2.putText(orig_frame, f"Frame: {frame_idx} | People: {people_detected}",
(10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (255, 255, 0), 2)
out.write(orig_frame)
frame_idx += 1
# Update progress bar
if total_frames > 0:
progress = int((frame_idx / total_frames) * 100)
if progress != last_progress:
@@ -257,14 +309,14 @@ def main(video_path=VIDEO_PATH):
out.release()
print(f"\n[INFO] Total frames processed: {frame_idx}")
print(f"[INFO] Total people detections: {sum(1 for e in events if 'start' in e['event'])}")
print(f"[INFO] Total hand raise events: {sum(1 for e in events if 'start' in e['event'])}")
print("[INFO] Writing event logs...")
# save events
output = {"video_fps": fps, "events": events}
with open(EVENT_JSON,"w") as f:
json.dump(output,f,indent=2)
with open(EVENT_ND,"w") as f:
with open(EVENT_JSON, "w") as f:
json.dump(output, f, indent=2)
with open(EVENT_ND, "w") as f:
for e in events:
f.write(json.dumps(e)+"\n")
@@ -280,7 +332,7 @@ if __name__=="__main__":
try:
main(VIDEO_PATH)
except Exception as e:
print("[FATAL]",str(e),file=sys.stderr)
print("[FATAL]", str(e), file=sys.stderr)
sys.exit(1)
finally:
print(f"[TOTAL] elapsed {time.time()-start_t:.2f}s")