Moondream Recipes
Explore real-world examples and implementation patterns using Moondream...
Content Moderation
Object Tracking
DeepSORT
Moondream2-2025-01-09
Promptable Content Moderation
A content moderation tool that uses Moondream 2B to detect and moderate content in videos via natural language prompts.
Key Features
🎯
Natural Language Prompts
Define content to moderate using natural language descriptions, leveraging Moondream's vision-language capabilities for flexible content detection (see the sketch after this list).
🎬
Intelligent Scene Detection
Advanced scene detection with DeepSORT tracking and automatic tracker resets at scene boundaries for improved accuracy (sketched after the code sample below).
🎨
Multiple Visualization Styles
Choose from several redaction styles for detected content, including obfuscated pixels, bounding boxes, and hitmarkers.
⚡
Optimized Processing
Grid-based detection for complex scenes, frame-by-frame processing with IoU-based merging, and GPU acceleration support (an IoU-merging sketch follows the code sample below).
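These prompts feed directly into Moondream's detection interface. A minimal sketch of prompt-driven detection, assuming the detect() method and normalized x_min/y_min/x_max/y_max box keys exposed by recent moondream2 revisions:

from PIL import Image
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "vikhyatk/moondream2", revision="2025-01-09",
    trust_remote_code=True, device_map={"": "cuda"}
)

image = Image.open("frame.jpg")
# Any natural-language description can serve as the detection target
result = model.detect(image, "cigarettes or vaping devices")  # assumed detect() API
for obj in result["objects"]:
    # Coordinates are assumed normalized to [0, 1]; scale by frame size before drawing
    print(obj["x_min"], obj["y_min"], obj["x_max"], obj["y_max"])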
Code Samples
promptable-content-moderation.py
#!/usr/bin/env python3
import cv2, os, subprocess, argparse
from PIL import Image
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, SamModel, SamProcessor
from tqdm import tqdm
import numpy as np
from datetime import datetime
import colorsys
import random
from deep_sort_integration import DeepSORTTracker
from scenedetect import detect, ContentDetector
from functools import lru_cache
# Constants
DEFAULT_TEST_MODE_DURATION = 3 # Process only first 3 seconds in test mode by default
def load_moondream():
    """Load Moondream model and tokenizer."""
    model = AutoModelForCausalLM.from_pretrained(
        "vikhyatk/moondream2", trust_remote_code=True, device_map={"": "cuda"}
    )
    tokenizer = AutoTokenizer.from_pretrained("vikhyatk/moondream2")
    return model, tokenizer

def get_video_properties(video_path):
    """Get basic video properties."""
    video = cv2.VideoCapture(video_path)
    fps = video.get(cv2.CAP_PROP_FPS)
    frame_count = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
    width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT))
    video.release()
    return {"fps": fps, "frame_count": frame_count, "width": width, "height": height}
Video Processing
Object Detection
Redaction
Moondream2-2025-01-09
Promptable Video Redaction
A tool that uses Moondream 2B to detect and redact objects from videos with multiple visualization styles, including censoring, bounding boxes, and hitmarkers.
Key Features
🎥
Multiple Visualization Styles
Choose censoring (black boxes), traditional bounding boxes, or Call of Duty-style hitmarkers for detected objects (see the sketch after this list).
📊
Grid-based Detection
Split frames into customizable grids for improved detection accuracy on complex scenes.
💬
Natural Language Detection
Specify objects to detect using natural language descriptions, leveraging Moondream's vision-language capabilities.
🌐
Web Interface
User-friendly web interface built with Gradio for easy video processing and configuration.
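A sketch of the three visualization styles applied to one detected box with OpenCV; the hitmarker geometry here is illustrative, not the recipe's exact rendering:

import cv2

def draw_detection(frame, box, style="censor"):
    """Render one (x1, y1, x2, y2) detection in the chosen style."""
    x1, y1, x2, y2 = box
    if style == "censor":
        cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 0, 0), -1)   # filled black box
    elif style == "bounding-box":
        cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)  # green outline
    elif style == "hitmarker":
        cx, cy, s = (x1 + x2) // 2, (y1 + y2) // 2, 12
        for dx, dy in ((-1, -1), (-1, 1), (1, -1), (1, 1)):       # four diagonal ticks
            cv2.line(frame, (cx + dx * s // 2, cy + dy * s // 2),
                     (cx + dx * s, cy + dy * s), (255, 255, 255), 2)
    return frame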
Code Samples
promptable-video-redaction.py
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import cv2
import numpy as np
from PIL import Image
import argparse
def process_video(input_path, detect_prompt, box_style='censor', grid_size=(1, 1)):
    """Process video with Moondream2 for object detection and redaction."""
    # Initialize Moondream model (this repo requires trust_remote_code)
    model_id = "vikhyatk/moondream2"
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        trust_remote_code=True,
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
        device_map="auto"
    )
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    # Open video
    cap = cv2.VideoCapture(input_path)
    fps = cap.get(cv2.CAP_PROP_FPS)
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

    # Process frames
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        # Convert frame to PIL Image for Moondream
        pil_image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

        # Process each grid cell and apply the chosen visualization
        # (see the process_region sketch below)
        for row in range(grid_size[0]):
            for col in range(grid_size[1]):
                process_region(frame, pil_image, model, tokenizer,
                               detect_prompt, box_style, row, col, grid_size)

    cap.release()
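process_region is referenced above but defined elsewhere in the recipe. A hedged sketch of what it does — crop one grid cell, run detection on it, and map cell-local boxes back to frame coordinates — reusing the assumed detect() API and the draw_detection helper sketched earlier (the tokenizer argument is unused in this sketch):

def process_region(frame, pil_image, model, tokenizer,
                   detect_prompt, box_style, row, col, grid_size):
    """Detect within one grid cell and draw results on the full frame."""
    w, h = pil_image.size
    cell_w, cell_h = w // grid_size[1], h // grid_size[0]
    left, top = col * cell_w, row * cell_h
    cell = pil_image.crop((left, top, left + cell_w, top + cell_h))

    result = model.detect(cell, detect_prompt)  # assumed detect() API
    for obj in result.get("objects", []):
        # Normalized cell coordinates -> absolute frame coordinates
        x1 = left + int(obj["x_min"] * cell_w)
        y1 = top + int(obj["y_min"] * cell_h)
        x2 = left + int(obj["x_max"] * cell_w)
        y2 = top + int(obj["y_max"] * cell_h)
        draw_detection(frame, (x1, y1, x2, y2), box_style)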
Video Intelligence
Face Detection
Gaze Detection
Moondream2-2025-01-09
Gaze Detection Video Processor
A video processing application that uses Moondream 2 to detect faces and track gaze direction in videos, visualizing detections and gaze paths in real time with dynamic visual effects.
Key Features
👁️
Multi-Face Gaze Tracking
Detect and track gaze directions for multiple faces simultaneously in video frames.
🎨
Dynamic Visualization
Real-time visualization with colored bounding boxes, gradient lines for gaze direction, and gaze target points (see the sketch after this list).
🎥
Video Processing
Support for common video formats (.mp4, .avi, .mov, .mkv) with progress tracking and batch processing capabilities.
⚡
GPU Acceleration
Optimized for GPU processing with automatic fallback to CPU when needed.
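A sketch of the gradient gaze line: interpolate from the face center toward the predicted gaze point, fading the color in along the way. This is plain OpenCV drawing; the face center and gaze point themselves come from the model:

import cv2
import numpy as np

def draw_gaze(frame, face_center, gaze_point, color=(0, 200, 255), steps=24):
    """Draw a gradient line from a face center to its gaze target point."""
    pts = np.linspace(face_center, gaze_point, steps)
    for i in range(steps - 1):
        alpha = (i + 1) / steps  # brighten toward the target
        c = tuple(int(ch * alpha) for ch in color)
        p1 = (int(pts[i][0]), int(pts[i][1]))
        p2 = (int(pts[i + 1][0]), int(pts[i + 1][1]))
        cv2.line(frame, p1, p2, c, 2)
    gx, gy = int(gaze_point[0]), int(gaze_point[1])
    cv2.circle(frame, (gx, gy), 5, color, -1)  # gaze target point
    return frame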
Code Samples
gaze-detection-video.py
"""
Gaze Detection Video Processor using Moondream 2
------------------------------------------------
Read the README.md file for more information on how to use this script.
"""
import torch
import cv2
from transformers import AutoModelForCausalLM, AutoTokenizer
from typing import Iterator, Optional, Tuple
from contextlib import contextmanager
def initialize_model() -> Tuple[Optional[AutoModelForCausalLM], Optional[AutoTokenizer]]:
    """Initialize the Moondream 2 model with error handling."""
    try:
        print("Initializing Moondream 2 model...")
        model_id = "vikhyatk/moondream2"
        revision = "2025-01-09"  # Pin a revision for stability
        device = "cuda" if torch.cuda.is_available() else "cpu"
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            revision=revision,
            trust_remote_code=True,
            torch_dtype=torch.float16 if device == "cuda" else torch.float32,
            device_map={"": device} if device == "cuda" else None
        )
        model = model.to(device).eval()
        tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)
        return model, tokenizer
    except Exception as e:
        print(f"Error initializing model: {e}")
        return None, None

@contextmanager
def video_handler(input_path: str, output_path: str) -> Iterator[Tuple[cv2.VideoCapture, cv2.VideoWriter]]:
    """Context manager for handling video capture and writer."""
    cap = cv2.VideoCapture(input_path)
    if not cap.isOpened():
        raise ValueError(f"Could not open video file: {input_path}")

    # Set up video writer with source properties
    fps = int(cap.get(cv2.CAP_PROP_FPS))
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    out = cv2.VideoWriter(output_path, cv2.VideoWriter_fourcc(*"mp4v"), fps, (width, height))

    try:
        yield cap, out
    finally:
        cap.release()
        out.release()
        cv2.destroyAllWindows()
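Putting the helpers together: the processing loop reads frames inside the context manager so the capture and writer are always released. The per-frame gaze inference is left as a placeholder, since its exact call depends on the pinned model revision:

def process_video(input_path: str, output_path: str) -> None:
    model, tokenizer = initialize_model()
    if model is None:
        return
    with video_handler(input_path, output_path) as (cap, out):
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            # ... run face and gaze detection on `frame`, draw the results ...
            out.write(frame)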
Gradio
Interactive UI
Object Detection
VQA
Interactive Image Analysis Demo
A Gradio-based web interface for real-time image analysis, featuring visual question answering and object detection with bounding box visualization.
Key Features
⚡️
Streaming Responses
Get real-time text generation with token-by-token streaming for a more interactive experience (a streaming sketch follows the code sample below).
🎯
Automatic Annotations
Visualize object detection results with automatically generated bounding boxes overlaid on images.
🔄
Flexible Runtime
Switch between CPU and GPU inference with automatic device detection and optimization.
🚀
Simple Deployment
Deploy as a standalone web app or integrate into existing Gradio applications with minimal configuration.
Code Samples
gradio_demo.py
# Core setup
import torch
import gradio as gr
from transformers import AutoTokenizer
from moondream import Moondream  # assumes the demo's accompanying moondream module

device = "cuda" if torch.cuda.is_available() else "cpu"
model_id = "vikhyatk/moondream2"
tokenizer = AutoTokenizer.from_pretrained(model_id)
moondream = Moondream.from_pretrained(model_id).to(device)

# Create the interface
with gr.Blocks() as demo:
    gr.Markdown("# 🌔 moondream")
    with gr.Row():
        prompt = gr.Textbox(label="Input Prompt", value="Describe this image.")
        submit = gr.Button("Submit")
    with gr.Row():
        img = gr.Image(type="pil", label="Upload an Image")
        with gr.Column():
            output = gr.Markdown(label="Response")
            ann = gr.Image(visible=False, label="Annotated Image")
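A hedged sketch of the streaming wiring, assuming the Moondream class exposes encode_image and an answer_question method that accepts a transformers TextIteratorStreamer, as earlier versions of this demo did. Generation runs in a background thread while the handler yields partial text to the output component:

from threading import Thread
from transformers import TextIteratorStreamer

def answer(prompt, image):
    """Yield the response incrementally as tokens arrive."""
    image_embeds = moondream.encode_image(image)  # assumed helper on the Moondream class
    streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
    thread = Thread(
        target=moondream.answer_question,  # assumed signature with a streamer kwarg
        kwargs={"image_embeds": image_embeds, "question": prompt,
                "tokenizer": tokenizer, "streamer": streamer},
    )
    thread.start()
    buffer = ""
    for new_text in streamer:
        buffer += new_text
        yield buffer

# Wire it up inside the `with gr.Blocks() as demo:` context:
#     submit.click(answer, inputs=[prompt, img], outputs=output)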