feat: Complete simulation engine with 10 components and level validation

COMPLETE SIMULATION SYSTEM IMPLEMENTATION ## New Simulation Components (7 added): - Database: Read/write latency, replication overhead, RPS capacity - Cache: In-memory caching with LRU/LFU/FIFO/Random eviction policies - Message Queue: FIFO processing, retention, backpressure, processing rate - Microservice: Auto-scaling, resource capacity, load balancing across instances - Monitoring/Alerting: Multi-metric alerting (latency, errors, queue size) - Third Party Service: External API reliability, rate limiting, failure modeling - Data Pipeline: Batch processing with 10 transformation types ## Enhanced Existing Components: - Web Server: Fixed property naming (rpsCapacity) - CDN: Fixed property naming (ttl) - Load Balancer: Maintained existing functionality - Engine: Added smart topology-based entry point detection ## Level Validation System: - Complete pass/fail game mechanics with scoring (0-100) - Performance validation: throughput, latency, availability, cost - Component validation: mustInclude, mustNotInclude, minReplicas - Detailed feedback with specific requirement failures - Smart scoring with performance bonuses ## Frontend Integration: - Real simulation execution (replaced mock data) - Level information extraction from URL paths - Rich results display with pass/fail feedback - Automatic entry node detection from design topology ## Infrastructure Updates: - Design Schema: Added missing properties, fixed coordinate precision (float64) - Authentication: GitHub OAuth protection for all game routes - Error Handling: Comprehensive validation and user feedback - Testing: 78 tests covering all components and integration scenarios ## Technical Achievements: - 100% simulation component coverage (10/10 components) - Realistic performance modeling for all component types - Discrete-event simulation with proper state management - Production-ready code without emojis - Comprehensive test suite with integration testing ## Breaking Changes: - Position coordinates now use float64 for precision - /simulate endpoint now requires authentication - Request/response format updated for level validation This completes the core simulation engine implementation and enables a complete educational game experience for learning system design.
7 months ago · c7e0307f08
34 changed files with 5540 additions and 35 deletions
--- a/internal/design/design.go
+++ b/internal/design/design.go
@ -10,8 +10,8 @@ type Node struct {
				@@ -10,8 +10,8 @@ type Node struct {
 }

 type Position struct {
-	X int `json:"x"`
-	Y int `json:"y"`
+	X float64 `json:"x"`
+	Y float64 `json:"y"`
 }

 type Connection struct {
@ -46,8 +46,10 @@ type CDN struct {
				@@ -46,8 +46,10 @@ type CDN struct {
 }

 type Database struct {
-	Label       string `json:"label"`
-	Replication int    `json:"replication"`
+	Label         string `json:"label"`
+	Replication   int    `json:"replication"`
+	MaxRPS        int    `json:"maxRPS"`
+	BaseLatencyMs int    `json:"baseLatencyMs"`
 }

 type DataPipeline struct {
@ -65,13 +67,14 @@ type MessageQueue struct {
				@@ -65,13 +67,14 @@ type MessageQueue struct {
 	Label            string `json:"label"`
 	QueueCapacity    int    `json:"queueCapacity"`
 	RetentionSeconds int    `json:"retentionSeconds"`
+	ProcessingRate   int    `json:"processingRate"`
 }

 type Microservice struct {
 	Label           string `json:"label"`
 	InstanceCount   int    `json:"instanceCount"`
 	CPU             int    `json:"cpu"`
-	RAMGb           int    `json:"ramGb"`
+	RamGb           int    `json:"ramGb"`
 	RPSCapacity     int    `json:"rpsCapacity"`
 	MonthlyUSD      int    `json:"monthlyUsd"`
 	ScalingStrategy string `json:"scalingStrategy"`
--- a/internal/simulation/cache.go
+++ b/internal/simulation/cache.go
@ -0,0 +1,180 @@
				@@ -0,0 +1,180 @@
+package simulation
+
+import (
+	"time"
+)
+
+type CacheLogic struct{}
+
+type CacheEntry struct {
+	Data        string
+	Timestamp   int
+	AccessTime  int
+	AccessCount int
+	InsertOrder int
+}
+
+func (c CacheLogic) Tick(props map[string]any, queue []*Request, tick int) ([]*Request, bool) {
+	// Extract cache properties
+	cacheTTL := int(AsFloat64(props["cacheTTL"]))
+	if cacheTTL == 0 {
+		cacheTTL = 300000 // default 5 minutes in ms
+	}
+
+	maxEntries := int(AsFloat64(props["maxEntries"]))
+	if maxEntries == 0 {
+		maxEntries = 1000 // default max entries
+	}
+
+	evictionPolicy := AsString(props["evictionPolicy"])
+	if evictionPolicy == "" {
+		evictionPolicy = "LRU" // default eviction policy
+	}
+
+	// Initialize cache data structures in props
+	cacheData, ok := props["_cacheData"].(map[string]*CacheEntry)
+	if !ok {
+		cacheData = make(map[string]*CacheEntry)
+		props["_cacheData"] = cacheData
+	}
+
+	insertCounter, ok := props["_insertCounter"].(int)
+	if !ok {
+		insertCounter = 0
+	}
+
+	// Current timestamp for this tick
+	currentTime := tick * 100 // assuming 100ms per tick
+
+	// Clean up expired entries first
+	c.cleanExpiredEntries(cacheData, currentTime, cacheTTL)
+
+	output := []*Request{}
+
+	for _, req := range queue {
+		cacheKey := req.ID + "-" + req.Type // Use request ID and type as cache key
+
+		// Check for cache hit
+		entry, hit := cacheData[cacheKey]
+		if hit && !c.isExpired(entry, currentTime, cacheTTL) {
+			// Cache hit - return immediately with minimal latency
+			reqCopy := *req
+			reqCopy.LatencyMS += 1 // 1ms for in-memory access
+			reqCopy.Path = append(reqCopy.Path, "cache-hit")
+
+			// Update access tracking for eviction policies
+			entry.AccessTime = currentTime
+			entry.AccessCount++
+
+			output = append(output, &reqCopy)
+		} else {
+			// Cache miss - forward request downstream
+			reqCopy := *req
+			reqCopy.Path = append(reqCopy.Path, "cache-miss")
+
+			// For simulation purposes, we'll cache the "response" immediately
+			// In a real system, this would happen when the response comes back
+			insertCounter++
+			newEntry := &CacheEntry{
+				Data:        "cached-data", // In real implementation, this would be the response data
+				Timestamp:   currentTime,
+				AccessTime:  currentTime,
+				AccessCount: 1,
+				InsertOrder: insertCounter,
+			}
+
+			// First check if we need to evict before adding
+			if len(cacheData) >= maxEntries {
+				c.evictEntry(cacheData, evictionPolicy)
+			}
+
+			// Now add the new entry
+			cacheData[cacheKey] = newEntry
+
+			output = append(output, &reqCopy)
+		}
+	}
+
+	// Update insert counter in props
+	props["_insertCounter"] = insertCounter
+
+	return output, true
+}
+
+func (c CacheLogic) cleanExpiredEntries(cacheData map[string]*CacheEntry, currentTime, ttl int) {
+	for key, entry := range cacheData {
+		if c.isExpired(entry, currentTime, ttl) {
+			delete(cacheData, key)
+		}
+	}
+}
+
+func (c CacheLogic) isExpired(entry *CacheEntry, currentTime, ttl int) bool {
+	return (currentTime - entry.Timestamp) > ttl
+}
+
+func (c CacheLogic) evictEntry(cacheData map[string]*CacheEntry, policy string) {
+	if len(cacheData) == 0 {
+		return
+	}
+
+	var keyToEvict string
+
+	switch policy {
+	case "LRU":
+		// Evict least recently used
+		oldestTime := int(^uint(0) >> 1) // Max int
+		for key, entry := range cacheData {
+			if entry.AccessTime < oldestTime {
+				oldestTime = entry.AccessTime
+				keyToEvict = key
+			}
+		}
+
+	case "LFU":
+		// Evict least frequently used
+		minCount := int(^uint(0) >> 1) // Max int
+		for key, entry := range cacheData {
+			if entry.AccessCount < minCount {
+				minCount = entry.AccessCount
+				keyToEvict = key
+			}
+		}
+
+	case "FIFO":
+		// Evict first in (oldest insert order)
+		minOrder := int(^uint(0) >> 1) // Max int
+		for key, entry := range cacheData {
+			if entry.InsertOrder < minOrder {
+				minOrder = entry.InsertOrder
+				keyToEvict = key
+			}
+		}
+
+	case "random":
+		// Evict random entry
+		keys := make([]string, 0, len(cacheData))
+		for key := range cacheData {
+			keys = append(keys, key)
+		}
+		if len(keys) > 0 {
+			// Use timestamp as pseudo-random seed
+			seed := time.Now().UnixNano()
+			keyToEvict = keys[seed%int64(len(keys))]
+		}
+
+	default:
+		// Default to LRU
+		oldestTime := int(^uint(0) >> 1)
+		for key, entry := range cacheData {
+			if entry.AccessTime < oldestTime {
+				oldestTime = entry.AccessTime
+				keyToEvict = key
+			}
+		}
+	}
+
+	if keyToEvict != "" {
+		delete(cacheData, keyToEvict)
+	}
+}
--- a/internal/simulation/cache_test.go
+++ b/internal/simulation/cache_test.go
@ -0,0 +1,319 @@
				@@ -0,0 +1,319 @@
+package simulation
+
+import (
+	"testing"
+)
+
+func TestCacheLogic_CacheHitMiss(t *testing.T) {
+	cache := CacheLogic{}
+
+	props := map[string]any{
+		"cacheTTL":       10000, // 10 seconds
+		"maxEntries":     100,
+		"evictionPolicy": "LRU",
+	}
+
+	// First request should be a miss
+	req1 := []*Request{{ID: "req1", Type: "GET", LatencyMS: 0, Path: []string{"start"}}}
+	output1, alive := cache.Tick(props, req1, 1)
+
+	if !alive {
+		t.Errorf("Cache should be alive")
+	}
+
+	if len(output1) != 1 {
+		t.Errorf("Expected 1 output request, got %d", len(output1))
+	}
+
+	// Should be cache miss
+	if output1[0].LatencyMS != 0 { // No latency added for miss
+		t.Errorf("Expected 0ms latency for cache miss, got %dms", output1[0].LatencyMS)
+	}
+
+	// Check path contains cache-miss
+	found := false
+	for _, pathItem := range output1[0].Path {
+		if pathItem == "cache-miss" {
+			found = true
+			break
+		}
+	}
+	if !found {
+		t.Errorf("Expected cache-miss in path, got %v", output1[0].Path)
+	}
+
+	// Second identical request should be a hit
+	req2 := []*Request{{ID: "req1", Type: "GET", LatencyMS: 0, Path: []string{"start"}}}
+	output2, _ := cache.Tick(props, req2, 2)
+
+	if len(output2) != 1 {
+		t.Errorf("Expected 1 output request, got %d", len(output2))
+	}
+
+	// Should be cache hit with 1ms latency
+	if output2[0].LatencyMS != 1 {
+		t.Errorf("Expected 1ms latency for cache hit, got %dms", output2[0].LatencyMS)
+	}
+
+	// Check path contains cache-hit
+	found = false
+	for _, pathItem := range output2[0].Path {
+		if pathItem == "cache-hit" {
+			found = true
+			break
+		}
+	}
+	if !found {
+		t.Errorf("Expected cache-hit in path, got %v", output2[0].Path)
+	}
+}
+
+func TestCacheLogic_TTLExpiration(t *testing.T) {
+	cache := CacheLogic{}
+
+	props := map[string]any{
+		"cacheTTL":       1000, // 1 second
+		"maxEntries":     100,
+		"evictionPolicy": "LRU",
+	}
+
+	// First request - cache miss
+	req1 := []*Request{{ID: "req1", Type: "GET", LatencyMS: 0}}
+	cache.Tick(props, req1, 1)
+
+	// Second request within TTL - cache hit
+	req2 := []*Request{{ID: "req1", Type: "GET", LatencyMS: 0}}
+	output2, _ := cache.Tick(props, req2, 5) // 5 * 100ms = 500ms later
+
+	if output2[0].LatencyMS != 1 {
+		t.Errorf("Expected cache hit (1ms), got %dms", output2[0].LatencyMS)
+	}
+
+	// Third request after TTL expiration - cache miss
+	req3 := []*Request{{ID: "req1", Type: "GET", LatencyMS: 0}}
+	output3, _ := cache.Tick(props, req3, 15) // 15 * 100ms = 1500ms later (expired)
+
+	if output3[0].LatencyMS != 0 {
+		t.Errorf("Expected cache miss (0ms) after TTL expiration, got %dms", output3[0].LatencyMS)
+	}
+}
+
+func TestCacheLogic_MaxEntriesEviction(t *testing.T) {
+	cache := CacheLogic{}
+
+	props := map[string]any{
+		"cacheTTL":       10000,
+		"maxEntries":     2, // Small cache size
+		"evictionPolicy": "LRU",
+	}
+
+	// Add first entry
+	req1 := []*Request{{ID: "req1", Type: "GET", LatencyMS: 0}}
+	cache.Tick(props, req1, 1)
+
+	// Add second entry
+	req2 := []*Request{{ID: "req2", Type: "GET", LatencyMS: 0}}
+	cache.Tick(props, req2, 2)
+
+	// Verify both are cached
+	req1Check := []*Request{{ID: "req1", Type: "GET", LatencyMS: 0}}
+	output1Check, _ := cache.Tick(props, req1Check, 3)
+	if output1Check[0].LatencyMS != 1 {
+		t.Errorf("Expected cache hit for req1, got %dms latency", output1Check[0].LatencyMS)
+	}
+
+	req2Check := []*Request{{ID: "req2", Type: "GET", LatencyMS: 0}}
+	output2Check, _ := cache.Tick(props, req2Check, 4)
+	if output2Check[0].LatencyMS != 1 {
+		t.Errorf("Expected cache hit for req2, got %dms latency", output2Check[0].LatencyMS)
+	}
+
+	// Add third entry (should evict LRU entry)
+	req3 := []*Request{{ID: "req3", Type: "GET", LatencyMS: 0}}
+	cache.Tick(props, req3, 5)
+
+	// req1 was accessed at tick 3, req2 at tick 4, so req1 should be evicted
+	req1CheckAgain := []*Request{{ID: "req1", Type: "GET", LatencyMS: 0}}
+	output1, _ := cache.Tick(props, req1CheckAgain, 6)
+	if output1[0].LatencyMS != 0 {
+		t.Errorf("Expected cache miss for LRU evicted entry, got %dms latency", output1[0].LatencyMS)
+	}
+
+	// After adding req1 back, the cache should be at capacity with different items
+	// We don't test further to avoid complex cascading eviction scenarios
+}
+
+func TestCacheLogic_LRUEviction(t *testing.T) {
+	cache := CacheLogic{}
+
+	props := map[string]any{
+		"cacheTTL":       10000,
+		"maxEntries":     2,
+		"evictionPolicy": "LRU",
+	}
+
+	// Add two entries
+	req1 := []*Request{{ID: "req1", Type: "GET", LatencyMS: 0}}
+	cache.Tick(props, req1, 1)
+
+	req2 := []*Request{{ID: "req2", Type: "GET", LatencyMS: 0}}
+	cache.Tick(props, req2, 2)
+
+	// Access first entry (make it recently used)
+	req1Access := []*Request{{ID: "req1", Type: "GET", LatencyMS: 0}}
+	cache.Tick(props, req1Access, 3)
+
+	// Add third entry (should evict req2, since req1 was more recently accessed)
+	req3 := []*Request{{ID: "req3", Type: "GET", LatencyMS: 0}}
+	cache.Tick(props, req3, 4)
+
+	// Verify that req2 was evicted (should be cache miss)
+	req2Check := []*Request{{ID: "req2", Type: "GET", LatencyMS: 0}}
+	output2, _ := cache.Tick(props, req2Check, 5)
+
+	if output2[0].LatencyMS != 0 {
+		t.Errorf("Expected cache miss for LRU evicted entry, got %dms latency", output2[0].LatencyMS)
+	}
+
+	// After adding req2 back, the cache should contain {req2, req1} or {req2, req3}
+	// depending on LRU logic. We don't test further to avoid cascading evictions.
+}
+
+func TestCacheLogic_FIFOEviction(t *testing.T) {
+	cache := CacheLogic{}
+
+	props := map[string]any{
+		"cacheTTL":       10000,
+		"maxEntries":     2,
+		"evictionPolicy": "FIFO",
+	}
+
+	// Add two entries
+	req1 := []*Request{{ID: "req1", Type: "GET", LatencyMS: 0}}
+	cache.Tick(props, req1, 1)
+
+	req2 := []*Request{{ID: "req2", Type: "GET", LatencyMS: 0}}
+	cache.Tick(props, req2, 2)
+
+	// Access first entry multiple times (shouldn't matter for FIFO)
+	req1Access := []*Request{{ID: "req1", Type: "GET", LatencyMS: 0}}
+	cache.Tick(props, req1Access, 3)
+	cache.Tick(props, req1Access, 4)
+
+	// Add third entry (should evict req1, the first inserted)
+	req3 := []*Request{{ID: "req3", Type: "GET", LatencyMS: 0}}
+	cache.Tick(props, req3, 5)
+
+	// Check that req1 was evicted (first in, first out)
+	req1Check := []*Request{{ID: "req1", Type: "GET", LatencyMS: 0}}
+	output1, _ := cache.Tick(props, req1Check, 6)
+
+	if output1[0].LatencyMS != 0 {
+		t.Errorf("Expected cache miss for FIFO evicted entry, got %dms latency", output1[0].LatencyMS)
+	}
+
+	// After adding req1 back, the cache should contain {req2, req1} or {req3, req1}
+	// depending on FIFO logic. We don't test further to avoid cascading evictions.
+}
+
+func TestCacheLogic_DefaultValues(t *testing.T) {
+	cache := CacheLogic{}
+
+	// Empty props should use defaults
+	props := map[string]any{}
+
+	req := []*Request{{ID: "req1", Type: "GET", LatencyMS: 0}}
+	output, _ := cache.Tick(props, req, 1)
+
+	if len(output) != 1 {
+		t.Errorf("Expected 1 output request")
+	}
+
+	// Should be cache miss with 0ms latency
+	if output[0].LatencyMS != 0 {
+		t.Errorf("Expected 0ms latency for cache miss with defaults, got %dms", output[0].LatencyMS)
+	}
+
+	// Second request should be cache hit
+	req2 := []*Request{{ID: "req1", Type: "GET", LatencyMS: 0}}
+	output2, _ := cache.Tick(props, req2, 2)
+
+	if output2[0].LatencyMS != 1 {
+		t.Errorf("Expected 1ms latency for cache hit, got %dms", output2[0].LatencyMS)
+	}
+}
+
+func TestCacheLogic_SimpleEviction(t *testing.T) {
+	cache := CacheLogic{}
+
+	props := map[string]any{
+		"cacheTTL":       10000,
+		"maxEntries":     1, // Only 1 entry allowed
+		"evictionPolicy": "LRU",
+	}
+
+	// Add first entry
+	req1 := []*Request{{ID: "req1", Type: "GET", LatencyMS: 0}}
+	output1, _ := cache.Tick(props, req1, 1)
+	if output1[0].LatencyMS != 0 {
+		t.Errorf("First request should be cache miss, got %dms", output1[0].LatencyMS)
+	}
+
+	// Check it's cached
+	req1Again := []*Request{{ID: "req1", Type: "GET", LatencyMS: 0}}
+	output1Again, _ := cache.Tick(props, req1Again, 2)
+	if output1Again[0].LatencyMS != 1 {
+		t.Errorf("Second request should be cache hit, got %dms", output1Again[0].LatencyMS)
+	}
+
+	// Add second entry (should evict first)
+	req2 := []*Request{{ID: "req2", Type: "GET", LatencyMS: 0}}
+	output2, _ := cache.Tick(props, req2, 3)
+	if output2[0].LatencyMS != 0 {
+		t.Errorf("New request should be cache miss, got %dms", output2[0].LatencyMS)
+	}
+
+	// Check that first entry is now evicted
+	req1Final := []*Request{{ID: "req1", Type: "GET", LatencyMS: 0}}
+	output1Final, _ := cache.Tick(props, req1Final, 4)
+	if output1Final[0].LatencyMS != 0 {
+		t.Errorf("Evicted entry should be cache miss, got %dms", output1Final[0].LatencyMS)
+	}
+
+	// Check that second entry is now also evicted (since req1 was re-added in step 4)
+	req2Again := []*Request{{ID: "req2", Type: "GET", LatencyMS: 0}}
+	output2Again, _ := cache.Tick(props, req2Again, 5)
+	if output2Again[0].LatencyMS != 0 {
+		t.Errorf("Re-evicted entry should be cache miss, got %dms", output2Again[0].LatencyMS)
+	}
+}
+
+func TestCacheLogic_DifferentRequestTypes(t *testing.T) {
+	cache := CacheLogic{}
+
+	props := map[string]any{
+		"cacheTTL":       10000,
+		"maxEntries":     100,
+		"evictionPolicy": "LRU",
+	}
+
+	// Same ID but different type should be different cache entries
+	req1 := []*Request{{ID: "req1", Type: "GET", LatencyMS: 0}}
+	cache.Tick(props, req1, 1)
+
+	req2 := []*Request{{ID: "req1", Type: "POST", LatencyMS: 0}}
+	output2, _ := cache.Tick(props, req2, 2)
+
+	// Should be cache miss since different type
+	if output2[0].LatencyMS != 0 {
+		t.Errorf("Expected cache miss for different request type, got %dms latency", output2[0].LatencyMS)
+	}
+
+	// Original GET should still be cached
+	req1Again := []*Request{{ID: "req1", Type: "GET", LatencyMS: 0}}
+	output1, _ := cache.Tick(props, req1Again, 3)
+
+	if output1[0].LatencyMS != 1 {
+		t.Errorf("Expected cache hit for original request type, got %dms latency", output1[0].LatencyMS)
+	}
+}
--- a/internal/simulation/cdn.go
+++ b/internal/simulation/cdn.go
@ -5,7 +5,7 @@ type CDNLogic struct{}
				@@ -5,7 +5,7 @@ type CDNLogic struct{}
 func (c CDNLogic) Tick(props map[string]any, queue []*Request, tick int) ([]*Request, bool) {

 	// read the ttl for cached content
-	ttl := int(AsFloat64(props["ttlMs"]))
+	ttl := int(AsFloat64(props["ttl"]))

 	// retrieve the cdn's cache from props
 	cache, ok := props["_cache"].(map[string]int)
--- a/internal/simulation/cdn_test.go
+++ b/internal/simulation/cdn_test.go
@ -9,7 +9,7 @@ func TestCDNLogic(t *testing.T) {
				@@ -9,7 +9,7 @@ func TestCDNLogic(t *testing.T) {
 	cdn := CDNLogic{}
 	cache := map[string]int{} // shared mutable cache
 	props := map[string]any{
-		"ttlMs":  float64(1000),
+		"ttl":    float64(1000),
 		"_cache": cache,
 	}

--- a/internal/simulation/database.go
+++ b/internal/simulation/database.go
@ -0,0 +1,61 @@
				@@ -0,0 +1,61 @@
+package simulation
+
+type DatabaseLogic struct{}
+
+func (d DatabaseLogic) Tick(props map[string]any, queue []*Request, tick int) ([]*Request, bool) {
+	// Extract database properties
+	replication := int(AsFloat64(props["replication"]))
+	if replication == 0 {
+		replication = 1 // default
+	}
+
+	// Database capacity (could be based on instance size or explicit RPS)
+	maxRPS := int(AsFloat64(props["maxRPS"]))
+	if maxRPS == 0 {
+		maxRPS = 1000 // default capacity
+	}
+
+	// Base latency for database operations
+	baseLatencyMs := int(AsFloat64(props["baseLatencyMs"]))
+	if baseLatencyMs == 0 {
+		baseLatencyMs = 10 // default 10ms for local DB operations
+	}
+
+	// Process requests up to capacity
+	toProcess := queue
+	if len(queue) > maxRPS {
+		toProcess = queue[:maxRPS]
+		// TODO: Could add queue overflow logic here
+	}
+
+	output := []*Request{}
+
+	for _, req := range toProcess {
+		// Add database latency to the request
+		reqCopy := *req
+
+		// Simulate different operation types and their latencies
+		operationLatency := baseLatencyMs
+
+		// Simple heuristic: reads are faster than writes
+		if req.Type == "GET" || req.Type == "READ" {
+			operationLatency = baseLatencyMs
+		} else if req.Type == "POST" || req.Type == "WRITE" {
+			operationLatency = baseLatencyMs * 2 // writes take longer
+		}
+
+		// Replication factor affects write latency
+		if req.Type == "POST" || req.Type == "WRITE" {
+			operationLatency += (replication - 1) * 5 // 5ms per replica
+		}
+
+		reqCopy.LatencyMS += operationLatency
+		reqCopy.Path = append(reqCopy.Path, "database-processed")
+
+		output = append(output, &reqCopy)
+	}
+
+	// Database health (could simulate failures, connection issues, etc.)
+	// For now, assume always healthy
+	return output, true
+}
--- a/internal/simulation/database_test.go
+++ b/internal/simulation/database_test.go
@ -0,0 +1,139 @@
				@@ -0,0 +1,139 @@
+package simulation
+
+import (
+	"testing"
+)
+
+func TestDatabaseLogic_BasicProcessing(t *testing.T) {
+	db := DatabaseLogic{}
+
+	props := map[string]any{
+		"replication":   2,
+		"maxRPS":        100,
+		"baseLatencyMs": 15,
+	}
+
+	// Create test requests
+	reqs := []*Request{
+		{ID: "req1", Type: "GET", LatencyMS: 0, Path: []string{"start"}},
+		{ID: "req2", Type: "POST", LatencyMS: 0, Path: []string{"start"}},
+	}
+
+	output, alive := db.Tick(props, reqs, 1)
+
+	if !alive {
+		t.Errorf("Database should be alive")
+	}
+
+	if len(output) != 2 {
+		t.Errorf("Expected 2 output requests, got %d", len(output))
+	}
+
+	// Check read latency (base latency)
+	readReq := output[0]
+	if readReq.LatencyMS != 15 {
+		t.Errorf("Expected read latency 15ms, got %dms", readReq.LatencyMS)
+	}
+
+	// Check write latency (base * 2 + replication penalty)
+	writeReq := output[1]
+	expectedWriteLatency := 15*2 + (2-1)*5 // 30 + 5 = 35ms
+	if writeReq.LatencyMS != expectedWriteLatency {
+		t.Errorf("Expected write latency %dms, got %dms", expectedWriteLatency, writeReq.LatencyMS)
+	}
+}
+
+func TestDatabaseLogic_CapacityLimit(t *testing.T) {
+	db := DatabaseLogic{}
+
+	props := map[string]any{
+		"maxRPS":        2,
+		"baseLatencyMs": 10,
+	}
+
+	// Create more requests than capacity
+	reqs := []*Request{
+		{ID: "req1", Type: "GET"},
+		{ID: "req2", Type: "GET"},
+		{ID: "req3", Type: "GET"}, // This should be dropped
+	}
+
+	output, _ := db.Tick(props, reqs, 1)
+
+	if len(output) != 2 {
+		t.Errorf("Expected capacity limit of 2, but processed %d requests", len(output))
+	}
+}
+
+func TestDatabaseLogic_DefaultValues(t *testing.T) {
+	db := DatabaseLogic{}
+
+	// Empty props should use defaults
+	props := map[string]any{}
+
+	reqs := []*Request{
+		{ID: "req1", Type: "GET", LatencyMS: 0},
+	}
+
+	output, _ := db.Tick(props, reqs, 1)
+
+	if len(output) != 1 {
+		t.Errorf("Expected 1 output request")
+	}
+
+	// Should use default 10ms base latency
+	if output[0].LatencyMS != 10 {
+		t.Errorf("Expected default latency 10ms, got %dms", output[0].LatencyMS)
+	}
+}
+
+func TestDatabaseLogic_ReplicationEffect(t *testing.T) {
+	db := DatabaseLogic{}
+
+	// Test with high replication
+	props := map[string]any{
+		"replication":   5,
+		"baseLatencyMs": 10,
+	}
+
+	reqs := []*Request{
+		{ID: "req1", Type: "POST", LatencyMS: 0},
+	}
+
+	output, _ := db.Tick(props, reqs, 1)
+
+	if len(output) != 1 {
+		t.Errorf("Expected 1 output request")
+	}
+
+	// Write latency: base*2 + (replication-1)*5 = 10*2 + (5-1)*5 = 20 + 20 = 40ms
+	expectedLatency := 10*2 + (5-1)*5
+	if output[0].LatencyMS != expectedLatency {
+		t.Errorf("Expected latency %dms with replication=5, got %dms", expectedLatency, output[0].LatencyMS)
+	}
+}
+
+func TestDatabaseLogic_ReadVsWrite(t *testing.T) {
+	db := DatabaseLogic{}
+
+	props := map[string]any{
+		"replication":   1,
+		"baseLatencyMs": 20,
+	}
+
+	readReq := []*Request{{ID: "read", Type: "GET", LatencyMS: 0}}
+	writeReq := []*Request{{ID: "write", Type: "POST", LatencyMS: 0}}
+
+	readOutput, _ := db.Tick(props, readReq, 1)
+	writeOutput, _ := db.Tick(props, writeReq, 1)
+
+	// Read should be base latency
+	if readOutput[0].LatencyMS != 20 {
+		t.Errorf("Expected read latency 20ms, got %dms", readOutput[0].LatencyMS)
+	}
+
+	// Write should be double base latency (no replication penalty with replication=1)
+	if writeOutput[0].LatencyMS != 40 {
+		t.Errorf("Expected write latency 40ms, got %dms", writeOutput[0].LatencyMS)
+	}
+}
--- a/internal/simulation/datapipeline.go
+++ b/internal/simulation/datapipeline.go
@ -0,0 +1,203 @@
				@@ -0,0 +1,203 @@
+package simulation
+
+type DataPipelineLogic struct{}
+
+type DataBatch struct {
+	ID           string
+	RecordCount  int
+	Timestamp    int
+	ProcessingMS int
+}
+
+type PipelineState struct {
+	ProcessingQueue  []DataBatch
+	CompletedBatches int
+	TotalRecords     int
+	BacklogSize      int
+}
+
+func (d DataPipelineLogic) Tick(props map[string]any, queue []*Request, tick int) ([]*Request, bool) {
+	// Extract data pipeline properties
+	batchSize := int(AsFloat64(props["batchSize"]))
+	if batchSize == 0 {
+		batchSize = 500 // default batch size
+	}
+
+	transformation := AsString(props["transformation"])
+	if transformation == "" {
+		transformation = "map" // default transformation
+	}
+
+	// Get pipeline state from props (persistent state)
+	state, ok := props["_pipelineState"].(PipelineState)
+	if !ok {
+		state = PipelineState{
+			ProcessingQueue:  []DataBatch{},
+			CompletedBatches: 0,
+			TotalRecords:     0,
+			BacklogSize:      0,
+		}
+	}
+
+	currentTime := tick * 100 // Convert tick to milliseconds
+
+	// Convert incoming requests to data batches
+	if len(queue) > 0 {
+		// Group requests into batches
+		batches := d.createBatches(queue, batchSize, currentTime, transformation)
+
+		// Add batches to processing queue
+		state.ProcessingQueue = append(state.ProcessingQueue, batches...)
+		state.BacklogSize += len(queue)
+	}
+
+	// Process batches that are ready (completed their processing time)
+	output := []*Request{}
+	remainingBatches := []DataBatch{}
+
+	for _, batch := range state.ProcessingQueue {
+		if currentTime >= batch.Timestamp+batch.ProcessingMS {
+			// Batch is complete - create output requests
+			for i := 0; i < batch.RecordCount; i++ {
+				processedReq := &Request{
+					ID:        batch.ID + "-record-" + string(rune('0'+i)),
+					Timestamp: batch.Timestamp,
+					LatencyMS: batch.ProcessingMS,
+					Origin:    "data-pipeline",
+					Type:      "PROCESSED",
+					Path:      []string{"pipeline-" + transformation},
+				}
+				output = append(output, processedReq)
+			}
+
+			state.CompletedBatches++
+			state.TotalRecords += batch.RecordCount
+		} else {
+			// Batch still processing
+			remainingBatches = append(remainingBatches, batch)
+		}
+	}
+
+	state.ProcessingQueue = remainingBatches
+	state.BacklogSize = len(remainingBatches) * batchSize
+
+	// Update persistent state
+	props["_pipelineState"] = state
+
+	// Health check: pipeline is healthy if backlog is not too large
+	maxBacklogSize := batchSize * 20 // Allow up to 20 batches in backlog
+	healthy := state.BacklogSize < maxBacklogSize
+
+	return output, healthy
+}
+
+// createBatches groups requests into batches and calculates processing time
+func (d DataPipelineLogic) createBatches(requests []*Request, batchSize int, timestamp int, transformation string) []DataBatch {
+	batches := []DataBatch{}
+
+	for i := 0; i < len(requests); i += batchSize {
+		end := i + batchSize
+		if end > len(requests) {
+			end = len(requests)
+		}
+
+		recordCount := end - i
+		processingTime := d.calculateProcessingTime(recordCount, transformation)
+
+		batch := DataBatch{
+			ID:           "batch-" + string(rune('A'+len(batches))),
+			RecordCount:  recordCount,
+			Timestamp:    timestamp,
+			ProcessingMS: processingTime,
+		}
+
+		batches = append(batches, batch)
+	}
+
+	return batches
+}
+
+// calculateProcessingTime determines how long a batch takes to process based on transformation type
+func (d DataPipelineLogic) calculateProcessingTime(recordCount int, transformation string) int {
+	// Base processing time per record
+	baseTimePerRecord := d.getTransformationComplexity(transformation)
+
+	// Total time scales with record count but with some economies of scale
+	totalTime := float64(recordCount) * baseTimePerRecord
+
+	// Add batch overhead (setup, teardown, I/O)
+	batchOverhead := d.getBatchOverhead(transformation)
+	totalTime += batchOverhead
+
+	// Apply economies of scale for larger batches (slightly more efficient)
+	if recordCount > 100 {
+		scaleFactor := 0.9 // 10% efficiency gain for large batches
+		totalTime *= scaleFactor
+	}
+
+	return int(totalTime)
+}
+
+// getTransformationComplexity returns base processing time per record in milliseconds
+func (d DataPipelineLogic) getTransformationComplexity(transformation string) float64 {
+	switch transformation {
+	case "map":
+		return 1.0 // Simple field mapping
+	case "filter":
+		return 0.5 // Just evaluate conditions
+	case "sort":
+		return 3.0 // Sorting requires more compute
+	case "aggregate":
+		return 2.0 // Grouping and calculating aggregates
+	case "join":
+		return 5.0 // Most expensive - joining with other datasets
+	case "deduplicate":
+		return 2.5 // Hash-based deduplication
+	case "validate":
+		return 1.5 // Data validation and cleaning
+	case "enrich":
+		return 4.0 // Enriching with external data
+	case "compress":
+		return 1.2 // Compression processing
+	case "encrypt":
+		return 2.0 // Encryption overhead
+	default:
+		return 1.0 // Default to simple transformation
+	}
+}
+
+// getBatchOverhead returns fixed overhead time per batch in milliseconds
+func (d DataPipelineLogic) getBatchOverhead(transformation string) float64 {
+	switch transformation {
+	case "map", "filter", "validate":
+		return 50.0 // Low overhead for simple operations
+	case "sort", "aggregate", "deduplicate":
+		return 200.0 // Medium overhead for complex operations
+	case "join", "enrich":
+		return 500.0 // High overhead for operations requiring external data
+	case "compress", "encrypt":
+		return 100.0 // Medium overhead for I/O operations
+	default:
+		return 100.0 // Default overhead
+	}
+}
+
+// Helper function to get pipeline statistics
+func (d DataPipelineLogic) GetPipelineStats(props map[string]any) map[string]interface{} {
+	state, ok := props["_pipelineState"].(PipelineState)
+	if !ok {
+		return map[string]interface{}{
+			"completedBatches": 0,
+			"totalRecords":     0,
+			"backlogSize":      0,
+			"queuedBatches":    0,
+		}
+	}
+
+	return map[string]interface{}{
+		"completedBatches": state.CompletedBatches,
+		"totalRecords":     state.TotalRecords,
+		"backlogSize":      state.BacklogSize,
+		"queuedBatches":    len(state.ProcessingQueue),
+	}
+}
--- a/internal/simulation/datapipeline_test.go
+++ b/internal/simulation/datapipeline_test.go
@ -0,0 +1,396 @@
				@@ -0,0 +1,396 @@
+package simulation
+
+import (
+	"testing"
+)
+
+func TestDataPipelineLogic_BasicProcessing(t *testing.T) {
+	logic := DataPipelineLogic{}
+
+	props := map[string]any{
+		"batchSize":      100.0,
+		"transformation": "map",
+	}
+
+	// Create 50 requests (less than batch size)
+	requests := make([]*Request, 50)
+	for i := range requests {
+		requests[i] = &Request{ID: string(rune('1' + i)), Type: "DATA", LatencyMS: 0}
+	}
+
+	// First tick - should create batch and start processing
+	output, healthy := logic.Tick(props, requests, 1)
+
+	if !healthy {
+		t.Error("Expected data pipeline to be healthy")
+	}
+
+	// Should not have output yet (batch is still processing)
+	if len(output) != 0 {
+		t.Errorf("Expected no output during processing, got %d", len(output))
+	}
+
+	// Check that batch was created
+	state, ok := props["_pipelineState"].(PipelineState)
+	if !ok {
+		t.Error("Expected pipeline state to be created")
+	}
+
+	if len(state.ProcessingQueue) != 1 {
+		t.Errorf("Expected 1 batch in processing queue, got %d", len(state.ProcessingQueue))
+	}
+
+	if state.ProcessingQueue[0].RecordCount != 50 {
+		t.Errorf("Expected batch with 50 records, got %d", state.ProcessingQueue[0].RecordCount)
+	}
+}
+
+func TestDataPipelineLogic_BatchCompletion(t *testing.T) {
+	logic := DataPipelineLogic{}
+
+	props := map[string]any{
+		"batchSize":      10.0,
+		"transformation": "filter", // Fast transformation
+	}
+
+	// Create 5 requests
+	requests := make([]*Request, 5)
+	for i := range requests {
+		requests[i] = &Request{ID: string(rune('1' + i)), Type: "DATA", LatencyMS: 0}
+	}
+
+	// First tick - start processing
+	logic.Tick(props, requests, 1)
+
+	// Wait enough ticks for processing to complete
+	// Filter transformation should complete quickly
+	var output []*Request
+	var healthy bool
+
+	for tick := 2; tick <= 10; tick++ {
+		output, healthy = logic.Tick(props, []*Request{}, tick)
+		if len(output) > 0 {
+			break
+		}
+	}
+
+	if !healthy {
+		t.Error("Expected data pipeline to be healthy")
+	}
+
+	// Should have output matching input count
+	if len(output) != 5 {
+		t.Errorf("Expected 5 output records, got %d", len(output))
+	}
+
+	// Check output structure
+	for _, req := range output {
+		if req.Type != "PROCESSED" {
+			t.Errorf("Expected PROCESSED type, got %s", req.Type)
+		}
+		if req.Origin != "data-pipeline" {
+			t.Errorf("Expected data-pipeline origin, got %s", req.Origin)
+		}
+		if len(req.Path) == 0 || req.Path[0] != "pipeline-filter" {
+			t.Error("Expected path to indicate filter transformation")
+		}
+	}
+}
+
+func TestDataPipelineLogic_MultipleBatches(t *testing.T) {
+	logic := DataPipelineLogic{}
+
+	props := map[string]any{
+		"batchSize":      10.0,
+		"transformation": "map",
+	}
+
+	// Create 25 requests (should create 3 batches: 10, 10, 5)
+	requests := make([]*Request, 25)
+	for i := range requests {
+		requests[i] = &Request{ID: string(rune('1' + i)), Type: "DATA", LatencyMS: 0}
+	}
+
+	// First tick - create batches
+	output, healthy := logic.Tick(props, requests, 1)
+
+	if !healthy {
+		t.Error("Expected data pipeline to be healthy")
+	}
+
+	if len(output) != 0 {
+		t.Error("Expected no immediate output")
+	}
+
+	// Check that 3 batches were created
+	state, ok := props["_pipelineState"].(PipelineState)
+	if !ok {
+		t.Error("Expected pipeline state to be created")
+	}
+
+	if len(state.ProcessingQueue) != 3 {
+		t.Errorf("Expected 3 batches in processing queue, got %d", len(state.ProcessingQueue))
+	}
+
+	// Verify batch sizes
+	expectedSizes := []int{10, 10, 5}
+	for i, batch := range state.ProcessingQueue {
+		if batch.RecordCount != expectedSizes[i] {
+			t.Errorf("Expected batch %d to have %d records, got %d",
+				i, expectedSizes[i], batch.RecordCount)
+		}
+	}
+}
+
+func TestDataPipelineLogic_TransformationComplexity(t *testing.T) {
+	logic := DataPipelineLogic{}
+
+	transformations := []string{"filter", "map", "sort", "aggregate", "join"}
+
+	for _, transformation := range transformations {
+		t.Run(transformation, func(t *testing.T) {
+			complexity := logic.getTransformationComplexity(transformation)
+
+			// Verify relative complexity ordering
+			switch transformation {
+			case "filter":
+				if complexity >= logic.getTransformationComplexity("map") {
+					t.Error("Filter should be simpler than map")
+				}
+			case "join":
+				if complexity <= logic.getTransformationComplexity("aggregate") {
+					t.Error("Join should be more complex than aggregate")
+				}
+			case "sort":
+				if complexity <= logic.getTransformationComplexity("map") {
+					t.Error("Sort should be more complex than map")
+				}
+			}
+
+			if complexity <= 0 {
+				t.Errorf("Expected positive complexity for %s", transformation)
+			}
+		})
+	}
+}
+
+func TestDataPipelineLogic_BatchOverhead(t *testing.T) {
+	logic := DataPipelineLogic{}
+
+	// Test different overhead levels
+	testCases := []struct {
+		transformation string
+		expectedRange  [2]float64 // [min, max]
+	}{
+		{"map", [2]float64{0, 100}},    // Low overhead
+		{"join", [2]float64{300, 600}}, // High overhead
+		{"sort", [2]float64{150, 300}}, // Medium overhead
+	}
+
+	for _, tc := range testCases {
+		overhead := logic.getBatchOverhead(tc.transformation)
+
+		if overhead < tc.expectedRange[0] || overhead > tc.expectedRange[1] {
+			t.Errorf("Expected %s overhead between %.0f-%.0f, got %.0f",
+				tc.transformation, tc.expectedRange[0], tc.expectedRange[1], overhead)
+		}
+	}
+}
+
+func TestDataPipelineLogic_ProcessingTime(t *testing.T) {
+	logic := DataPipelineLogic{}
+
+	// Test that processing time scales with record count
+	smallBatch := logic.calculateProcessingTime(10, "map")
+	largeBatch := logic.calculateProcessingTime(100, "map")
+
+	if largeBatch <= smallBatch {
+		t.Error("Expected larger batch to take more time")
+	}
+
+	// Test that complex transformations take longer
+	simpleTime := logic.calculateProcessingTime(50, "filter")
+	complexTime := logic.calculateProcessingTime(50, "join")
+
+	if complexTime <= simpleTime {
+		t.Error("Expected complex transformation to take longer")
+	}
+
+	// Test economies of scale (large batches should be more efficient per record)
+	smallPerRecord := float64(smallBatch) / 10.0
+	largePerRecord := float64(largeBatch) / 100.0
+
+	if largePerRecord >= smallPerRecord {
+		t.Error("Expected economies of scale for larger batches")
+	}
+}
+
+func TestDataPipelineLogic_HealthCheck(t *testing.T) {
+	logic := DataPipelineLogic{}
+
+	props := map[string]any{
+		"batchSize":      10.0,
+		"transformation": "join", // Slow transformation
+	}
+
+	// Create a large number of requests to test backlog health
+	requests := make([]*Request, 300) // 30 batches (above healthy threshold)
+	for i := range requests {
+		requests[i] = &Request{ID: string(rune('1' + (i % 26))), Type: "DATA", LatencyMS: 0}
+	}
+
+	// First tick - should create many batches
+	output, healthy := logic.Tick(props, requests, 1)
+
+	// Should be unhealthy due to large backlog
+	if healthy {
+		t.Error("Expected data pipeline to be unhealthy with large backlog")
+	}
+
+	if len(output) != 0 {
+		t.Error("Expected no immediate output with slow transformation")
+	}
+
+	// Check backlog size
+	state, ok := props["_pipelineState"].(PipelineState)
+	if !ok {
+		t.Error("Expected pipeline state to be created")
+	}
+
+	if state.BacklogSize < 200 {
+		t.Errorf("Expected large backlog, got %d", state.BacklogSize)
+	}
+}
+
+func TestDataPipelineLogic_DefaultValues(t *testing.T) {
+	logic := DataPipelineLogic{}
+
+	// Empty props should use defaults
+	props := map[string]any{}
+
+	requests := []*Request{{ID: "1", Type: "DATA", LatencyMS: 0}}
+
+	output, healthy := logic.Tick(props, requests, 1)
+
+	if !healthy {
+		t.Error("Expected pipeline to be healthy with default values")
+	}
+
+	if len(output) != 0 {
+		t.Error("Expected no immediate output")
+	}
+
+	// Should use default batch size and transformation
+	state, ok := props["_pipelineState"].(PipelineState)
+	if !ok {
+		t.Error("Expected pipeline state to be created with defaults")
+	}
+
+	if len(state.ProcessingQueue) != 1 {
+		t.Error("Expected one batch with default settings")
+	}
+}
+
+func TestDataPipelineLogic_PipelineStats(t *testing.T) {
+	logic := DataPipelineLogic{}
+
+	props := map[string]any{
+		"batchSize":      5.0,
+		"transformation": "filter",
+	}
+
+	// Initial stats should be empty
+	stats := logic.GetPipelineStats(props)
+	if stats["completedBatches"] != 0 {
+		t.Error("Expected initial completed batches to be 0")
+	}
+
+	// Process some data
+	requests := make([]*Request, 10)
+	for i := range requests {
+		requests[i] = &Request{ID: string(rune('1' + i)), Type: "DATA", LatencyMS: 0}
+	}
+
+	logic.Tick(props, requests, 1)
+
+	// Check stats after processing
+	stats = logic.GetPipelineStats(props)
+	if stats["queuedBatches"] != 2 {
+		t.Errorf("Expected 2 queued batches, got %v", stats["queuedBatches"])
+	}
+
+	if stats["backlogSize"] != 10 {
+		t.Errorf("Expected backlog size of 10, got %v", stats["backlogSize"])
+	}
+}
+
+func TestDataPipelineLogic_ContinuousProcessing(t *testing.T) {
+	logic := DataPipelineLogic{}
+
+	props := map[string]any{
+		"batchSize":      5.0,
+		"transformation": "map",
+	}
+
+	// Process multiple waves of data
+	totalOutput := 0
+
+	for wave := 0; wave < 3; wave++ {
+		requests := make([]*Request, 5)
+		for i := range requests {
+			requests[i] = &Request{ID: string(rune('A' + wave*5 + i)), Type: "DATA", LatencyMS: 0}
+		}
+
+		// Process each wave
+		for tick := wave*10 + 1; tick <= wave*10+5; tick++ {
+			var output []*Request
+			if tick == wave*10+1 {
+				output, _ = logic.Tick(props, requests, tick)
+			} else {
+				output, _ = logic.Tick(props, []*Request{}, tick)
+			}
+			totalOutput += len(output)
+		}
+	}
+
+	// Should have processed all data eventually
+	if totalOutput != 15 {
+		t.Errorf("Expected 15 total output records, got %d", totalOutput)
+	}
+
+	// Check final stats
+	stats := logic.GetPipelineStats(props)
+	if stats["totalRecords"] != 15 {
+		t.Errorf("Expected 15 total records processed, got %v", stats["totalRecords"])
+	}
+}
+
+func TestDataPipelineLogic_EmptyQueue(t *testing.T) {
+	logic := DataPipelineLogic{}
+
+	props := map[string]any{
+		"batchSize":      10.0,
+		"transformation": "map",
+	}
+
+	// Process empty queue
+	output, healthy := logic.Tick(props, []*Request{}, 1)
+
+	if !healthy {
+		t.Error("Expected pipeline to be healthy with empty queue")
+	}
+
+	if len(output) != 0 {
+		t.Error("Expected no output with empty queue")
+	}
+
+	// State should be initialized but empty
+	state, ok := props["_pipelineState"].(PipelineState)
+	if !ok {
+		t.Error("Expected pipeline state to be initialized")
+	}
+
+	if len(state.ProcessingQueue) != 0 {
+		t.Error("Expected empty processing queue")
+	}
+}
--- a/internal/simulation/engine.go
+++ b/internal/simulation/engine.go
@ -185,6 +185,20 @@ func GetLogicForType(t string) NodeLogic {
				@@ -185,6 +185,20 @@ func GetLogicForType(t string) NodeLogic {
 		return LoadBalancerLogic{}
 	case "cdn":
 		return CDNLogic{}
+	case "database":
+		return DatabaseLogic{}
+	case "cache":
+		return CacheLogic{}
+	case "messageQueue":
+		return MessageQueueLogic{}
+	case "microservice":
+		return MicroserviceLogic{}
+	case "monitoring/alerting":
+		return MonitoringLogic{}
+	case "third party service":
+		return ThirdPartyServiceLogic{}
+	case "data pipeline":
+		return DataPipelineLogic{}
 	default:
 		return nil
 	}
--- a/internal/simulation/engine_test.go
+++ b/internal/simulation/engine_test.go
@ -1,6 +1,8 @@
				@@ -1,6 +1,8 @@
 package simulation

 import (
+	"encoding/json"
+	"os"
 	"testing"

 	"systemdesigngame/internal/design"
@ -10,8 +12,8 @@ import (
				@@ -10,8 +12,8 @@ import (
 func TestSimpleChainSimulation(t *testing.T) {
 	d := design.Design{
 		Nodes: []design.Node{
-			{ID: "a", Type: "webserver", Props: map[string]any{"capacityRPS": 1, "baseLatencyMs": 10}},
-			{ID: "b", Type: "webserver", Props: map[string]any{"capacityRPS": 1, "baseLatencyMs": 10}},
+			{ID: "a", Type: "webserver", Props: map[string]any{"rpsCapacity": 1, "baseLatencyMs": 10}},
+			{ID: "b", Type: "webserver", Props: map[string]any{"rpsCapacity": 1, "baseLatencyMs": 10}},
 		},
 		Connections: []design.Connection{
 			{Source: "a", Target: "b"},
@ -49,8 +51,8 @@ func TestSimpleChainSimulation(t *testing.T) {
				@@ -49,8 +51,8 @@ func TestSimpleChainSimulation(t *testing.T) {
 func TestSingleTickRouting(t *testing.T) {
 	d := design.Design{
 		Nodes: []design.Node{
-			{ID: "a", Type: "webserver", Props: map[string]any{"capacityRPS": 1.0, "baseLatencyMs": 10.0}},
-			{ID: "b", Type: "webserver", Props: map[string]any{"capacityRPS": 1.0, "baseLatencyMs": 10.0}},
+			{ID: "a", Type: "webserver", Props: map[string]any{"rpsCapacity": 1.0, "baseLatencyMs": 10.0}},
+			{ID: "b", Type: "webserver", Props: map[string]any{"rpsCapacity": 1.0, "baseLatencyMs": 10.0}},
 		},
 		Connections: []design.Connection{
 			{Source: "a", Target: "b"},
@ -85,7 +87,7 @@ func TestSingleTickRouting(t *testing.T) {
				@@ -85,7 +87,7 @@ func TestSingleTickRouting(t *testing.T) {
 func TestHighRPSSimulation(t *testing.T) {
 	d := design.Design{
 		Nodes: []design.Node{
-			{ID: "entry", Type: "webserver", Props: map[string]any{"capacityRPS": 5000, "baseLatencyMs": 1}},
+			{ID: "entry", Type: "webserver", Props: map[string]any{"rpsCapacity": 5000, "baseLatencyMs": 1}},
 		},
 		Connections: []design.Connection{},
 	}
@ -106,3 +108,854 @@ func TestHighRPSSimulation(t *testing.T) {
				@@ -106,3 +108,854 @@ func TestHighRPSSimulation(t *testing.T) {
 		t.Errorf("expected %d total emitted requests, got %d", expected, totalEmitted)
 	}
 }
+
+func TestDatabaseIntegration(t *testing.T) {
+	design := design.Design{
+		Nodes: []design.Node{
+			{
+				ID:   "webserver",
+				Type: "webserver",
+				Props: map[string]interface{}{
+					"rpsCapacity": 10,
+				},
+			},
+			{
+				ID:   "database",
+				Type: "database",
+				Props: map[string]interface{}{
+					"replication":   2,
+					"maxRPS":        100,
+					"baseLatencyMs": 20,
+				},
+			},
+		},
+		Connections: []design.Connection{
+			{
+				Source: "webserver",
+				Target: "database",
+			},
+		},
+	}
+
+	engine := NewEngineFromDesign(design, 100)
+	engine.RPS = 5
+	engine.EntryNode = "webserver"
+
+	snapshots := engine.Run(3, 100)
+
+	if len(snapshots) != 3 {
+		t.Errorf("Expected 3 snapshots, got %d", len(snapshots))
+	}
+
+	// Verify database node exists and is healthy
+	if len(engine.Nodes) != 2 {
+		t.Errorf("Expected 2 nodes (webserver + database), got %d", len(engine.Nodes))
+	}
+
+	dbNode, exists := engine.Nodes["database"]
+	if !exists {
+		t.Errorf("Database node should exist in simulation")
+	}
+
+	if !dbNode.Alive {
+		t.Errorf("Database node should be alive")
+	}
+
+	if dbNode.Type != "database" {
+		t.Errorf("Expected database type, got %s", dbNode.Type)
+	}
+}
+
+func TestCacheIntegration(t *testing.T) {
+	design := design.Design{
+		Nodes: []design.Node{
+			{
+				ID:   "webserver",
+				Type: "webserver",
+				Props: map[string]interface{}{
+					"rpsCapacity": 10,
+				},
+			},
+			{
+				ID:   "cache",
+				Type: "cache",
+				Props: map[string]interface{}{
+					"cacheTTL":       5000,
+					"maxEntries":     50,
+					"evictionPolicy": "LRU",
+				},
+			},
+			{
+				ID:   "database",
+				Type: "database",
+				Props: map[string]interface{}{
+					"replication":   1,
+					"maxRPS":        100,
+					"baseLatencyMs": 15,
+				},
+			},
+		},
+		Connections: []design.Connection{
+			{
+				Source: "webserver",
+				Target: "cache",
+			},
+			{
+				Source: "cache",
+				Target: "database",
+			},
+		},
+	}
+
+	engine := NewEngineFromDesign(design, 100)
+	engine.RPS = 5
+	engine.EntryNode = "webserver"
+
+	snapshots := engine.Run(5, 100)
+
+	if len(snapshots) != 5 {
+		t.Errorf("Expected 5 snapshots, got %d", len(snapshots))
+	}
+
+	// Verify all nodes exist and are healthy
+	if len(engine.Nodes) != 3 {
+		t.Errorf("Expected 3 nodes (webserver + cache + database), got %d", len(engine.Nodes))
+	}
+
+	cacheNode, exists := engine.Nodes["cache"]
+	if !exists {
+		t.Errorf("Cache node should exist in simulation")
+	}
+
+	if !cacheNode.Alive {
+		t.Errorf("Cache node should be alive")
+	}
+
+	if cacheNode.Type != "cache" {
+		t.Errorf("Expected cache type, got %s", cacheNode.Type)
+	}
+
+	// Verify cache has internal state
+	cacheData, ok := cacheNode.Props["_cacheData"]
+	if !ok {
+		t.Errorf("Cache should have internal _cacheData state")
+	}
+
+	// Cache data should be a map
+	if _, ok := cacheData.(map[string]*CacheEntry); !ok {
+		t.Errorf("Cache data should be map[string]*CacheEntry")
+	}
+}
+
+func TestMessageQueueIntegration(t *testing.T) {
+	design := design.Design{
+		Nodes: []design.Node{
+			{
+				ID:   "producer",
+				Type: "webserver",
+				Props: map[string]interface{}{
+					"rpsCapacity": 10,
+				},
+			},
+			{
+				ID:   "messagequeue",
+				Type: "messageQueue",
+				Props: map[string]interface{}{
+					"queueCapacity":    50,
+					"retentionSeconds": 3600,
+					"processingRate":   5,
+				},
+			},
+			{
+				ID:   "consumer",
+				Type: "webserver",
+				Props: map[string]interface{}{
+					"rpsCapacity": 20,
+				},
+			},
+		},
+		Connections: []design.Connection{
+			{
+				Source: "producer",
+				Target: "messagequeue",
+			},
+			{
+				Source: "messagequeue",
+				Target: "consumer",
+			},
+		},
+	}
+
+	engine := NewEngineFromDesign(design, 100)
+	engine.RPS = 3
+	engine.EntryNode = "producer"
+
+	snapshots := engine.Run(5, 100)
+
+	if len(snapshots) != 5 {
+		t.Errorf("Expected 5 snapshots, got %d", len(snapshots))
+	}
+
+	// Verify all nodes exist and are healthy
+	if len(engine.Nodes) != 3 {
+		t.Errorf("Expected 3 nodes (producer + queue + consumer), got %d", len(engine.Nodes))
+	}
+
+	queueNode, exists := engine.Nodes["messagequeue"]
+	if !exists {
+		t.Errorf("Message queue node should exist in simulation")
+	}
+
+	if !queueNode.Alive {
+		t.Errorf("Message queue node should be alive")
+	}
+
+	if queueNode.Type != "messageQueue" {
+		t.Errorf("Expected messageQueue type, got %s", queueNode.Type)
+	}
+
+	// Verify queue has internal state
+	messageQueue, ok := queueNode.Props["_messageQueue"]
+	if !ok {
+		t.Errorf("Message queue should have internal _messageQueue state")
+	}
+
+	// Message queue should be a slice
+	if _, ok := messageQueue.([]QueuedMessage); !ok {
+		t.Errorf("Message queue should be []QueuedMessage")
+	}
+}
+
+func TestMicroserviceIntegration(t *testing.T) {
+	// Load the microservice design
+	designData, err := os.ReadFile("testdata/microservice_design.json")
+	if err != nil {
+		t.Fatalf("Failed to read microservice design: %v", err)
+	}
+
+	var d design.Design
+	if err := json.Unmarshal(designData, &d); err != nil {
+		t.Fatalf("Failed to unmarshal design: %v", err)
+	}
+
+	// Create engine
+	engine := NewEngineFromDesign(d, 100)
+	if engine == nil {
+		t.Fatalf("Failed to create engine from microservice design")
+	}
+
+	// Set up simulation parameters
+	engine.RPS = 30
+	engine.EntryNode = "webserver-1"
+
+	// Run simulation for 5 ticks
+	snapshots := engine.Run(5, 100)
+
+	if len(snapshots) != 5 {
+		t.Errorf("Expected 5 snapshots, got %d", len(snapshots))
+	}
+
+	// Verify microservice nodes exist and are configured correctly
+	userService, exists := engine.Nodes["microservice-1"]
+	if !exists {
+		t.Errorf("User service microservice node should exist")
+	}
+
+	if !userService.Alive {
+		t.Errorf("User service should be alive")
+	}
+
+	if userService.Type != "microservice" {
+		t.Errorf("Expected microservice type, got %s", userService.Type)
+	}
+
+	orderService, exists := engine.Nodes["microservice-2"]
+	if !exists {
+		t.Errorf("Order service microservice node should exist")
+	}
+
+	if !orderService.Alive {
+		t.Errorf("Order service should be alive")
+	}
+
+	// Verify auto-scaling properties are preserved
+	userServiceInstanceCount := userService.Props["instanceCount"]
+	if userServiceInstanceCount == nil {
+		t.Errorf("User service should have instanceCount property")
+	}
+
+	// Verify different scaling strategies
+	userScalingStrategy := userService.Props["scalingStrategy"]
+	if userScalingStrategy != "auto" {
+		t.Errorf("Expected auto scaling strategy for user service, got %v", userScalingStrategy)
+	}
+
+	orderScalingStrategy := orderService.Props["scalingStrategy"]
+	if orderScalingStrategy != "manual" {
+		t.Errorf("Expected manual scaling strategy for order service, got %v", orderScalingStrategy)
+	}
+
+	// Verify resource configurations
+	userCPU := userService.Props["cpu"]
+	if userCPU != 4.0 {
+		t.Errorf("Expected user service to have 4 CPU cores, got %v", userCPU)
+	}
+
+	orderRAM := orderService.Props["ramGb"]
+	if orderRAM != 4.0 {
+		t.Errorf("Expected order service to have 4GB RAM, got %v", orderRAM)
+	}
+
+	// Check that microservices processed requests through the simulation
+	lastSnapshot := snapshots[len(snapshots)-1]
+	if len(lastSnapshot.QueueSizes) == 0 {
+		t.Errorf("Expected queue sizes to be tracked in snapshots")
+	}
+
+	// Verify load balancer connected to microservices
+	loadBalancer, exists := engine.Nodes["lb-1"]
+	if !exists {
+		t.Errorf("Load balancer should exist")
+	}
+
+	if !loadBalancer.Alive {
+		t.Errorf("Load balancer should be alive")
+	}
+
+	// Verify database connection exists
+	database, exists := engine.Nodes["db-1"]
+	if !exists {
+		t.Errorf("Database should exist")
+	}
+
+	if !database.Alive {
+		t.Errorf("Database should be alive")
+	}
+}
+
+func TestMonitoringIntegration(t *testing.T) {
+	// Load the monitoring design
+	designData, err := os.ReadFile("testdata/monitoring_design.json")
+	if err != nil {
+		t.Fatalf("Failed to read monitoring design: %v", err)
+	}
+
+	var d design.Design
+	if err := json.Unmarshal(designData, &d); err != nil {
+		t.Fatalf("Failed to unmarshal design: %v", err)
+	}
+
+	// Create engine
+	engine := NewEngineFromDesign(d, 100)
+	if engine == nil {
+		t.Fatalf("Failed to create engine from monitoring design")
+	}
+
+	// Set up simulation parameters
+	engine.RPS = 20
+	engine.EntryNode = "webserver-1"
+
+	// Run simulation for 10 ticks to allow metrics collection
+	snapshots := engine.Run(10, 100)
+
+	if len(snapshots) != 10 {
+		t.Errorf("Expected 10 snapshots, got %d", len(snapshots))
+	}
+
+	// Verify monitoring nodes exist and are configured correctly
+	monitor1, exists := engine.Nodes["monitor-1"]
+	if !exists {
+		t.Errorf("Latency monitor node should exist")
+	}
+
+	if !monitor1.Alive {
+		t.Errorf("Latency monitor should be alive")
+	}
+
+	if monitor1.Type != "monitoring/alerting" {
+		t.Errorf("Expected monitoring/alerting type, got %s", monitor1.Type)
+	}
+
+	monitor2, exists := engine.Nodes["monitor-2"]
+	if !exists {
+		t.Errorf("Error rate monitor node should exist")
+	}
+
+	if !monitor2.Alive {
+		t.Errorf("Error rate monitor should be alive")
+	}
+
+	// Verify monitoring properties are preserved
+	tool1 := monitor1.Props["tool"]
+	if tool1 != "Prometheus" {
+		t.Errorf("Expected Prometheus tool for monitor-1, got %v", tool1)
+	}
+
+	tool2 := monitor2.Props["tool"]
+	if tool2 != "Datadog" {
+		t.Errorf("Expected Datadog tool for monitor-2, got %v", tool2)
+	}
+
+	alertMetric1 := monitor1.Props["alertMetric"]
+	if alertMetric1 != "latency" {
+		t.Errorf("Expected latency alert metric for monitor-1, got %v", alertMetric1)
+	}
+
+	alertMetric2 := monitor2.Props["alertMetric"]
+	if alertMetric2 != "error_rate" {
+		t.Errorf("Expected error_rate alert metric for monitor-2, got %v", alertMetric2)
+	}
+
+	// Check that metrics were collected during simulation
+	metrics1, ok := monitor1.Props["_metrics"]
+	if !ok {
+		t.Errorf("Expected monitor-1 to have collected metrics")
+	}
+
+	if metrics1 == nil {
+		t.Errorf("Expected monitor-1 metrics to be non-nil")
+	}
+
+	// Check alert count tracking
+	alertCount1, ok := monitor1.Props["_alertCount"]
+	if !ok {
+		t.Errorf("Expected monitor-1 to track alert count")
+	}
+
+	if alertCount1 == nil {
+		t.Errorf("Expected monitor-1 alert count to be tracked")
+	}
+
+	// Verify other components in the chain
+	webserver, exists := engine.Nodes["webserver-1"]
+	if !exists {
+		t.Errorf("Web server should exist")
+	}
+
+	if !webserver.Alive {
+		t.Errorf("Web server should be alive")
+	}
+
+	loadBalancer, exists := engine.Nodes["lb-1"]
+	if !exists {
+		t.Errorf("Load balancer should exist")
+	}
+
+	if !loadBalancer.Alive {
+		t.Errorf("Load balancer should be alive")
+	}
+
+	// Verify microservices
+	userService, exists := engine.Nodes["microservice-1"]
+	if !exists {
+		t.Errorf("User service should exist")
+	}
+
+	if !userService.Alive {
+		t.Errorf("User service should be alive")
+	}
+
+	orderService, exists := engine.Nodes["microservice-2"]
+	if !exists {
+		t.Errorf("Order service should exist")
+	}
+
+	if !orderService.Alive {
+		t.Errorf("Order service should be alive")
+	}
+
+	// Verify database
+	database, exists := engine.Nodes["db-1"]
+	if !exists {
+		t.Errorf("Database should exist")
+	}
+
+	if !database.Alive {
+		t.Errorf("Database should be alive")
+	}
+
+	// Check that requests flowed through the monitoring chain
+	lastSnapshot := snapshots[len(snapshots)-1]
+	if len(lastSnapshot.QueueSizes) == 0 {
+		t.Errorf("Expected queue sizes to be tracked in snapshots")
+	}
+
+	// Verify monitoring nodes processed requests
+	if lastSnapshot.NodeHealth["monitor-1"] != true {
+		t.Errorf("Expected monitor-1 to be healthy in final snapshot")
+	}
+
+	if lastSnapshot.NodeHealth["monitor-2"] != true {
+		t.Errorf("Expected monitor-2 to be healthy in final snapshot")
+	}
+}
+
+func TestThirdPartyServiceIntegration(t *testing.T) {
+	// Load the third party service design
+	designData, err := os.ReadFile("testdata/thirdpartyservice_design.json")
+	if err != nil {
+		t.Fatalf("Failed to read third party service design: %v", err)
+	}
+
+	var d design.Design
+	if err := json.Unmarshal(designData, &d); err != nil {
+		t.Fatalf("Failed to unmarshal design: %v", err)
+	}
+
+	// Create engine
+	engine := NewEngineFromDesign(d, 100)
+	if engine == nil {
+		t.Fatalf("Failed to create engine from third party service design")
+	}
+
+	// Set up simulation parameters
+	engine.RPS = 10 // Lower RPS to reduce chance of random failures affecting health
+	engine.EntryNode = "webserver-1"
+
+	// Run simulation for 5 ticks (shorter run to reduce random failure impact)
+	snapshots := engine.Run(5, 100)
+
+	if len(snapshots) != 5 {
+		t.Errorf("Expected 5 snapshots, got %d", len(snapshots))
+	}
+
+	// Verify third party service nodes exist and are configured correctly
+	stripeService, exists := engine.Nodes["stripe-service"]
+	if !exists {
+		t.Errorf("Stripe service node should exist")
+	}
+
+	if stripeService.Type != "third party service" {
+		t.Errorf("Expected third party service type, got %s", stripeService.Type)
+	}
+
+	twilioService, exists := engine.Nodes["twilio-service"]
+	if !exists {
+		t.Errorf("Twilio service node should exist")
+	}
+
+	sendgridService, exists := engine.Nodes["sendgrid-service"]
+	if !exists {
+		t.Errorf("SendGrid service node should exist")
+	}
+
+	slackService, exists := engine.Nodes["slack-service"]
+	if !exists {
+		t.Errorf("Slack service node should exist")
+	}
+
+	// Note: We don't check if services are alive here because the random failure
+	// simulation can cause services to go down, which is realistic behavior
+
+	// Verify provider configurations are preserved
+	stripeProvider := stripeService.Props["provider"]
+	if stripeProvider != "Stripe" {
+		t.Errorf("Expected Stripe provider, got %v", stripeProvider)
+	}
+
+	twilioProvider := twilioService.Props["provider"]
+	if twilioProvider != "Twilio" {
+		t.Errorf("Expected Twilio provider, got %v", twilioProvider)
+	}
+
+	sendgridProvider := sendgridService.Props["provider"]
+	if sendgridProvider != "SendGrid" {
+		t.Errorf("Expected SendGrid provider, got %v", sendgridProvider)
+	}
+
+	slackProvider := slackService.Props["provider"]
+	if slackProvider != "Slack" {
+		t.Errorf("Expected Slack provider, got %v", slackProvider)
+	}
+
+	// Verify latency configurations
+	stripeLatency := stripeService.Props["latency"]
+	if stripeLatency != 180.0 {
+		t.Errorf("Expected Stripe latency 180, got %v", stripeLatency)
+	}
+
+	twilioLatency := twilioService.Props["latency"]
+	if twilioLatency != 250.0 {
+		t.Errorf("Expected Twilio latency 250, got %v", twilioLatency)
+	}
+
+	// Check that service status was initialized and tracked
+	stripeStatus, ok := stripeService.Props["_serviceStatus"]
+	if !ok {
+		t.Errorf("Expected Stripe service status to be tracked")
+	}
+
+	if stripeStatus == nil {
+		t.Errorf("Expected Stripe service status to be non-nil")
+	}
+
+	// Verify other components in the chain
+	webserver, exists := engine.Nodes["webserver-1"]
+	if !exists {
+		t.Errorf("Web server should exist")
+	}
+
+	if !webserver.Alive {
+		t.Errorf("Web server should be alive")
+	}
+
+	// Verify microservices
+	paymentService, exists := engine.Nodes["microservice-1"]
+	if !exists {
+		t.Errorf("Payment service should exist")
+	}
+
+	if !paymentService.Alive {
+		t.Errorf("Payment service should be alive")
+	}
+
+	notificationService, exists := engine.Nodes["microservice-2"]
+	if !exists {
+		t.Errorf("Notification service should exist")
+	}
+
+	if !notificationService.Alive {
+		t.Errorf("Notification service should be alive")
+	}
+
+	// Verify monitoring and database
+	monitor, exists := engine.Nodes["monitor-1"]
+	if !exists {
+		t.Errorf("Monitor should exist")
+	}
+
+	if !monitor.Alive {
+		t.Errorf("Monitor should be alive")
+	}
+
+	database, exists := engine.Nodes["db-1"]
+	if !exists {
+		t.Errorf("Database should exist")
+	}
+
+	if !database.Alive {
+		t.Errorf("Database should be alive")
+	}
+
+	// Check that requests flowed through the third party services
+	lastSnapshot := snapshots[len(snapshots)-1]
+	if len(lastSnapshot.QueueSizes) == 0 {
+		t.Errorf("Expected queue sizes to be tracked in snapshots")
+	}
+
+	// Verify third party services are being tracked in snapshots
+	// Note: We don't assert health status because random failures are realistic
+	_, stripeHealthTracked := lastSnapshot.NodeHealth["stripe-service"]
+	if !stripeHealthTracked {
+		t.Errorf("Expected Stripe service health to be tracked in snapshots")
+	}
+
+	_, twilioHealthTracked := lastSnapshot.NodeHealth["twilio-service"]
+	if !twilioHealthTracked {
+		t.Errorf("Expected Twilio service health to be tracked in snapshots")
+	}
+
+	_, sendgridHealthTracked := lastSnapshot.NodeHealth["sendgrid-service"]
+	if !sendgridHealthTracked {
+		t.Errorf("Expected SendGrid service health to be tracked in snapshots")
+	}
+
+	_, slackHealthTracked := lastSnapshot.NodeHealth["slack-service"]
+	if !slackHealthTracked {
+		t.Errorf("Expected Slack service health to be tracked in snapshots")
+	}
+}
+
+func TestDataPipelineIntegration(t *testing.T) {
+	// Load the data pipeline design
+	designData, err := os.ReadFile("testdata/datapipeline_design.json")
+	if err != nil {
+		t.Fatalf("Failed to read data pipeline design: %v", err)
+	}
+
+	var d design.Design
+	if err := json.Unmarshal(designData, &d); err != nil {
+		t.Fatalf("Failed to unmarshal design: %v", err)
+	}
+
+	// Create engine
+	engine := NewEngineFromDesign(d, 100)
+	if engine == nil {
+		t.Fatalf("Failed to create engine from data pipeline design")
+	}
+
+	// Set up simulation parameters
+	engine.RPS = 20
+	engine.EntryNode = "data-source"
+
+	// Run simulation for 10 ticks to test data pipeline processing
+	snapshots := engine.Run(10, 100)
+
+	if len(snapshots) != 10 {
+		t.Errorf("Expected 10 snapshots, got %d", len(snapshots))
+	}
+
+	// Verify data pipeline nodes exist and are configured correctly
+	etlPipeline1, exists := engine.Nodes["etl-pipeline-1"]
+	if !exists {
+		t.Errorf("ETL Pipeline 1 node should exist")
+	}
+
+	if etlPipeline1.Type != "data pipeline" {
+		t.Errorf("Expected data pipeline type, got %s", etlPipeline1.Type)
+	}
+
+	etlPipeline2, exists := engine.Nodes["etl-pipeline-2"]
+	if !exists {
+		t.Errorf("ETL Pipeline 2 node should exist")
+	}
+
+	mlPipeline, exists := engine.Nodes["ml-pipeline"]
+	if !exists {
+		t.Errorf("ML Pipeline node should exist")
+	}
+
+	analyticsPipeline, exists := engine.Nodes["analytics-pipeline"]
+	if !exists {
+		t.Errorf("Analytics Pipeline node should exist")
+	}
+
+	compressionPipeline, exists := engine.Nodes["compression-pipeline"]
+	if !exists {
+		t.Errorf("Compression Pipeline node should exist")
+	}
+
+	// Verify pipeline configurations are preserved
+	etl1BatchSize := etlPipeline1.Props["batchSize"]
+	if etl1BatchSize != 100.0 {
+		t.Errorf("Expected ETL Pipeline 1 batch size 100, got %v", etl1BatchSize)
+	}
+
+	etl1Transformation := etlPipeline1.Props["transformation"]
+	if etl1Transformation != "validate" {
+		t.Errorf("Expected validate transformation, got %v", etl1Transformation)
+	}
+
+	etl2BatchSize := etlPipeline2.Props["batchSize"]
+	if etl2BatchSize != 50.0 {
+		t.Errorf("Expected ETL Pipeline 2 batch size 50, got %v", etl2BatchSize)
+	}
+
+	etl2Transformation := etlPipeline2.Props["transformation"]
+	if etl2Transformation != "aggregate" {
+		t.Errorf("Expected aggregate transformation, got %v", etl2Transformation)
+	}
+
+	mlTransformation := mlPipeline.Props["transformation"]
+	if mlTransformation != "enrich" {
+		t.Errorf("Expected enrich transformation for ML pipeline, got %v", mlTransformation)
+	}
+
+	analyticsTransformation := analyticsPipeline.Props["transformation"]
+	if analyticsTransformation != "join" {
+		t.Errorf("Expected join transformation for analytics pipeline, got %v", analyticsTransformation)
+	}
+
+	compressionTransformation := compressionPipeline.Props["transformation"]
+	if compressionTransformation != "compress" {
+		t.Errorf("Expected compress transformation, got %v", compressionTransformation)
+	}
+
+	// Check that pipeline state was initialized and tracked
+	etl1State, ok := etlPipeline1.Props["_pipelineState"]
+	if !ok {
+		t.Errorf("Expected ETL Pipeline 1 to have pipeline state")
+	}
+
+	if etl1State == nil {
+		t.Errorf("Expected ETL Pipeline 1 state to be non-nil")
+	}
+
+	// Verify other components in the data flow
+	dataSource, exists := engine.Nodes["data-source"]
+	if !exists {
+		t.Errorf("Data source should exist")
+	}
+
+	if !dataSource.Alive {
+		t.Errorf("Data source should be alive")
+	}
+
+	rawDataQueue, exists := engine.Nodes["raw-data-queue"]
+	if !exists {
+		t.Errorf("Raw data queue should exist")
+	}
+
+	if !rawDataQueue.Alive {
+		t.Errorf("Raw data queue should be alive")
+	}
+
+	// Verify storage components
+	cache, exists := engine.Nodes["cache-1"]
+	if !exists {
+		t.Errorf("Feature cache should exist")
+	}
+
+	if !cache.Alive {
+		t.Errorf("Feature cache should be alive")
+	}
+
+	dataWarehouse, exists := engine.Nodes["data-warehouse"]
+	if !exists {
+		t.Errorf("Data warehouse should exist")
+	}
+
+	if !dataWarehouse.Alive {
+		t.Errorf("Data warehouse should be alive")
+	}
+
+	// Verify monitoring
+	monitor, exists := engine.Nodes["monitoring-1"]
+	if !exists {
+		t.Errorf("Pipeline monitor should exist")
+	}
+
+	if !monitor.Alive {
+		t.Errorf("Pipeline monitor should be alive")
+	}
+
+	// Check that data pipelines are being tracked in snapshots
+	lastSnapshot := snapshots[len(snapshots)-1]
+	if len(lastSnapshot.QueueSizes) == 0 {
+		t.Errorf("Expected queue sizes to be tracked in snapshots")
+	}
+
+	// Verify data pipeline health is tracked
+	_, etl1HealthTracked := lastSnapshot.NodeHealth["etl-pipeline-1"]
+	if !etl1HealthTracked {
+		t.Errorf("Expected ETL Pipeline 1 health to be tracked in snapshots")
+	}
+
+	_, etl2HealthTracked := lastSnapshot.NodeHealth["etl-pipeline-2"]
+	if !etl2HealthTracked {
+		t.Errorf("Expected ETL Pipeline 2 health to be tracked in snapshots")
+	}
+
+	_, mlHealthTracked := lastSnapshot.NodeHealth["ml-pipeline"]
+	if !mlHealthTracked {
+		t.Errorf("Expected ML Pipeline health to be tracked in snapshots")
+	}
+
+	_, analyticsHealthTracked := lastSnapshot.NodeHealth["analytics-pipeline"]
+	if !analyticsHealthTracked {
+		t.Errorf("Expected Analytics Pipeline health to be tracked in snapshots")
+	}
+
+	_, compressionHealthTracked := lastSnapshot.NodeHealth["compression-pipeline"]
+	if !compressionHealthTracked {
+		t.Errorf("Expected Compression Pipeline health to be tracked in snapshots")
+	}
+
+	// Verify the data flow chain exists (all components are connected)
+	// This ensures the integration test validates the complete data processing architecture
+	totalNodes := len(engine.Nodes)
+	expectedNodes := 10 // From the design JSON
+	if totalNodes != expectedNodes {
+		t.Errorf("Expected %d total nodes in data pipeline architecture, got %d", expectedNodes, totalNodes)
+	}
+}
--- a/internal/simulation/messagequeue.go
+++ b/internal/simulation/messagequeue.go
@ -0,0 +1,115 @@
				@@ -0,0 +1,115 @@
+package simulation
+
+type MessageQueueLogic struct{}
+
+type QueuedMessage struct {
+	RequestID   string
+	Timestamp   int
+	MessageData string
+	RetryCount  int
+}
+
+func (mq MessageQueueLogic) Tick(props map[string]any, queue []*Request, tick int) ([]*Request, bool) {
+	// Extract message queue properties
+	queueCapacity := int(AsFloat64(props["queueCapacity"]))
+	if queueCapacity == 0 {
+		queueCapacity = 1000 // default capacity
+	}
+
+	retentionSeconds := int(AsFloat64(props["retentionSeconds"]))
+	if retentionSeconds == 0 {
+		retentionSeconds = 86400 // default 24 hours in seconds
+	}
+
+	// Processing rate (messages per tick)
+	processingRate := int(AsFloat64(props["processingRate"]))
+	if processingRate == 0 {
+		processingRate = 100 // default 100 messages per tick
+	}
+
+	// Current timestamp for this tick
+	currentTime := tick * 100 // assuming 100ms per tick
+
+	// Initialize queue storage in props
+	messageQueue, ok := props["_messageQueue"].([]QueuedMessage)
+	if !ok {
+		messageQueue = []QueuedMessage{}
+	}
+
+	// Clean up expired messages based on retention policy
+	messageQueue = mq.cleanExpiredMessages(messageQueue, currentTime, retentionSeconds*1000)
+
+	// First, process existing messages from the queue (FIFO order)
+	output := []*Request{}
+	messagesToProcess := len(messageQueue)
+	if messagesToProcess > processingRate {
+		messagesToProcess = processingRate
+	}
+
+	for i := 0; i < messagesToProcess; i++ {
+		if len(messageQueue) == 0 {
+			break
+		}
+
+		// Dequeue message (FIFO - take from front)
+		message := messageQueue[0]
+		messageQueue = messageQueue[1:]
+
+		// Create request for downstream processing
+		processedReq := &Request{
+			ID:        message.RequestID,
+			Timestamp: message.Timestamp,
+			LatencyMS: 2, // Small latency for queue processing
+			Origin:    "message-queue",
+			Type:      "PROCESS",
+			Path:      []string{"queued-message"},
+		}
+
+		output = append(output, processedReq)
+	}
+
+	// Then, add incoming requests to the queue for next tick
+	for _, req := range queue {
+		// Check if queue is at capacity
+		if len(messageQueue) >= queueCapacity {
+			// Queue full - message is dropped (or could implement backpressure)
+			// For now, we'll drop the message and add latency penalty
+			reqCopy := *req
+			reqCopy.LatencyMS += 1000 // High latency penalty for dropped messages
+			reqCopy.Path = append(reqCopy.Path, "queue-full-dropped")
+			// Don't add to output as message was dropped
+			continue
+		}
+
+		// Add message to queue
+		message := QueuedMessage{
+			RequestID:   req.ID,
+			Timestamp:   currentTime,
+			MessageData: "message-payload", // In real system, this would be the actual message
+			RetryCount:  0,
+		}
+		messageQueue = append(messageQueue, message)
+	}
+
+	// Update queue storage in props
+	props["_messageQueue"] = messageQueue
+
+	// Queue is healthy if not at capacity or if we can still process messages
+	// Queue becomes unhealthy only when completely full AND we can't process anything
+	healthy := len(messageQueue) < queueCapacity || processingRate > 0
+
+	return output, healthy
+}
+
+func (mq MessageQueueLogic) cleanExpiredMessages(messageQueue []QueuedMessage, currentTime, retentionMs int) []QueuedMessage {
+	cleaned := []QueuedMessage{}
+
+	for _, message := range messageQueue {
+		if (currentTime - message.Timestamp) <= retentionMs {
+			cleaned = append(cleaned, message)
+		}
+		// Expired messages are dropped
+	}
+
+	return cleaned
+}
--- a/internal/simulation/messagequeue_test.go
+++ b/internal/simulation/messagequeue_test.go
@ -0,0 +1,329 @@
				@@ -0,0 +1,329 @@
+package simulation
+
+import (
+	"testing"
+)
+
+func TestMessageQueueLogic_BasicProcessing(t *testing.T) {
+	mq := MessageQueueLogic{}
+
+	props := map[string]any{
+		"queueCapacity":    10,
+		"retentionSeconds": 3600, // 1 hour
+		"processingRate":   5,
+	}
+
+	// Add some messages to the queue
+	reqs := []*Request{
+		{ID: "msg1", Type: "SEND", LatencyMS: 0, Timestamp: 100},
+		{ID: "msg2", Type: "SEND", LatencyMS: 0, Timestamp: 100},
+		{ID: "msg3", Type: "SEND", LatencyMS: 0, Timestamp: 100},
+	}
+
+	output, healthy := mq.Tick(props, reqs, 1)
+
+	if !healthy {
+		t.Errorf("Message queue should be healthy")
+	}
+
+	// No immediate output since messages are queued first
+	if len(output) != 0 {
+		t.Errorf("Expected 0 immediate output (messages queued), got %d", len(output))
+	}
+
+	// Check that messages are in the queue
+	messageQueue, ok := props["_messageQueue"].([]QueuedMessage)
+	if !ok {
+		t.Errorf("Expected message queue to be initialized")
+	}
+
+	if len(messageQueue) != 3 {
+		t.Errorf("Expected 3 messages in queue, got %d", len(messageQueue))
+	}
+
+	// Process the queue (no new incoming messages)
+	output2, _ := mq.Tick(props, []*Request{}, 2)
+
+	// Should process up to processingRate (5) messages
+	if len(output2) != 3 {
+		t.Errorf("Expected 3 processed messages, got %d", len(output2))
+	}
+
+	// Queue should now be empty
+	messageQueue2, _ := props["_messageQueue"].([]QueuedMessage)
+	if len(messageQueue2) != 0 {
+		t.Errorf("Expected empty queue after processing, got %d messages", len(messageQueue2))
+	}
+
+	// Check output message properties
+	for _, msg := range output2 {
+		if msg.LatencyMS != 2 {
+			t.Errorf("Expected 2ms processing latency, got %dms", msg.LatencyMS)
+		}
+		if msg.Type != "PROCESS" {
+			t.Errorf("Expected PROCESS type, got %s", msg.Type)
+		}
+	}
+}
+
+func TestMessageQueueLogic_CapacityLimit(t *testing.T) {
+	mq := MessageQueueLogic{}
+
+	props := map[string]any{
+		"queueCapacity":    2, // Small capacity
+		"retentionSeconds": 3600,
+		"processingRate":   1,
+	}
+
+	// Add more messages than capacity
+	reqs := []*Request{
+		{ID: "msg1", Type: "SEND", LatencyMS: 0},
+		{ID: "msg2", Type: "SEND", LatencyMS: 0},
+		{ID: "msg3", Type: "SEND", LatencyMS: 0}, // This should be dropped
+	}
+
+	output, healthy := mq.Tick(props, reqs, 1)
+
+	// Queue should be healthy (can still process messages)
+	if !healthy {
+		t.Errorf("Queue should be healthy (can still process)")
+	}
+
+	// Should have no immediate output (messages queued)
+	if len(output) != 0 {
+		t.Errorf("Expected 0 immediate output, got %d", len(output))
+	}
+
+	// Check queue size
+	messageQueue, _ := props["_messageQueue"].([]QueuedMessage)
+	if len(messageQueue) != 2 {
+		t.Errorf("Expected 2 messages in queue (capacity limit), got %d", len(messageQueue))
+	}
+
+	// Add another message when queue is full
+	reqs2 := []*Request{{ID: "msg4", Type: "SEND", LatencyMS: 0}}
+	output2, healthy2 := mq.Tick(props, reqs2, 2)
+
+	// Queue should still be healthy (can process messages)
+	if !healthy2 {
+		t.Errorf("Queue should remain healthy (can still process)")
+	}
+
+	// Should have 1 processed message (processingRate = 1)
+	if len(output2) != 1 {
+		t.Errorf("Expected 1 processed message, got %d", len(output2))
+	}
+
+	// Queue should have 2 messages (started with 2, processed 1 leaving 1, added 1 new since space available)
+	messageQueue2, _ := props["_messageQueue"].([]QueuedMessage)
+	if len(messageQueue2) != 2 {
+		t.Errorf("Expected 2 messages in queue (1 remaining + 1 new), got %d", len(messageQueue2))
+	}
+}
+
+func TestMessageQueueLogic_ProcessingRate(t *testing.T) {
+	mq := MessageQueueLogic{}
+
+	props := map[string]any{
+		"queueCapacity":    100,
+		"retentionSeconds": 3600,
+		"processingRate":   3, // Process 3 messages per tick
+	}
+
+	// Add 10 messages
+	reqs := []*Request{}
+	for i := 0; i < 10; i++ {
+		reqs = append(reqs, &Request{ID: "msg" + string(rune(i+'0')), Type: "SEND"})
+	}
+
+	// First tick: queue all messages
+	mq.Tick(props, reqs, 1)
+
+	// Second tick: process at rate limit
+	output, _ := mq.Tick(props, []*Request{}, 2)
+
+	if len(output) != 3 {
+		t.Errorf("Expected 3 processed messages (rate limit), got %d", len(output))
+	}
+
+	// Check remaining queue size
+	messageQueue, _ := props["_messageQueue"].([]QueuedMessage)
+	if len(messageQueue) != 7 {
+		t.Errorf("Expected 7 messages remaining in queue, got %d", len(messageQueue))
+	}
+
+	// Third tick: process 3 more
+	output2, _ := mq.Tick(props, []*Request{}, 3)
+
+	if len(output2) != 3 {
+		t.Errorf("Expected 3 more processed messages, got %d", len(output2))
+	}
+
+	// Check remaining queue size
+	messageQueue2, _ := props["_messageQueue"].([]QueuedMessage)
+	if len(messageQueue2) != 4 {
+		t.Errorf("Expected 4 messages remaining in queue, got %d", len(messageQueue2))
+	}
+}
+
+func TestMessageQueueLogic_MessageRetention(t *testing.T) {
+	mq := MessageQueueLogic{}
+
+	props := map[string]any{
+		"queueCapacity":    100,
+		"retentionSeconds": 1, // 1 second retention
+		"processingRate":   0, // Don't process messages, just test retention
+	}
+
+	// Add messages at tick 1
+	reqs := []*Request{
+		{ID: "msg1", Type: "SEND", Timestamp: 100},
+		{ID: "msg2", Type: "SEND", Timestamp: 100},
+	}
+
+	mq.Tick(props, reqs, 1)
+
+	// Check messages are queued
+	messageQueue, _ := props["_messageQueue"].([]QueuedMessage)
+	if len(messageQueue) != 2 {
+		t.Errorf("Expected 2 messages in queue, got %d", len(messageQueue))
+	}
+
+	// Tick at time that should expire messages (tick 20 = 2000ms, retention = 1000ms)
+	output, _ := mq.Tick(props, []*Request{}, 20)
+
+	// Messages should be expired and removed
+	messageQueue2, _ := props["_messageQueue"].([]QueuedMessage)
+	if len(messageQueue2) != 0 {
+		t.Errorf("Expected messages to be expired and removed, got %d", len(messageQueue2))
+	}
+
+	// No output since processingRate = 0
+	if len(output) != 0 {
+		t.Errorf("Expected no output with processingRate=0, got %d", len(output))
+	}
+}
+
+func TestMessageQueueLogic_FIFOOrdering(t *testing.T) {
+	mq := MessageQueueLogic{}
+
+	props := map[string]any{
+		"queueCapacity":    10,
+		"retentionSeconds": 3600,
+		"processingRate":   2,
+	}
+
+	// Add messages in order
+	reqs := []*Request{
+		{ID: "first", Type: "SEND"},
+		{ID: "second", Type: "SEND"},
+		{ID: "third", Type: "SEND"},
+	}
+
+	mq.Tick(props, reqs, 1)
+
+	// Process 2 messages
+	output, _ := mq.Tick(props, []*Request{}, 2)
+
+	if len(output) != 2 {
+		t.Errorf("Expected 2 processed messages, got %d", len(output))
+	}
+
+	// Check FIFO order
+	if output[0].ID != "first" {
+		t.Errorf("Expected first message to be 'first', got '%s'", output[0].ID)
+	}
+
+	if output[1].ID != "second" {
+		t.Errorf("Expected second message to be 'second', got '%s'", output[1].ID)
+	}
+
+	// Process remaining message
+	output2, _ := mq.Tick(props, []*Request{}, 3)
+
+	if len(output2) != 1 {
+		t.Errorf("Expected 1 remaining message, got %d", len(output2))
+	}
+
+	if output2[0].ID != "third" {
+		t.Errorf("Expected remaining message to be 'third', got '%s'", output2[0].ID)
+	}
+}
+
+func TestMessageQueueLogic_DefaultValues(t *testing.T) {
+	mq := MessageQueueLogic{}
+
+	// Empty props should use defaults
+	props := map[string]any{}
+
+	reqs := []*Request{{ID: "msg1", Type: "SEND"}}
+	output, healthy := mq.Tick(props, reqs, 1)
+
+	if !healthy {
+		t.Errorf("Queue should be healthy with default values")
+	}
+
+	// Should queue the message (no immediate output)
+	if len(output) != 0 {
+		t.Errorf("Expected message to be queued (0 output), got %d", len(output))
+	}
+
+	// Check that message was queued with defaults
+	messageQueue, _ := props["_messageQueue"].([]QueuedMessage)
+	if len(messageQueue) != 1 {
+		t.Errorf("Expected 1 message queued with defaults, got %d", len(messageQueue))
+	}
+
+	// Process with defaults (should process up to default rate)
+	output2, _ := mq.Tick(props, []*Request{}, 2)
+
+	if len(output2) != 1 {
+		t.Errorf("Expected 1 processed message with defaults, got %d", len(output2))
+	}
+}
+
+func TestMessageQueueLogic_ContinuousFlow(t *testing.T) {
+	mq := MessageQueueLogic{}
+
+	props := map[string]any{
+		"queueCapacity":    5,
+		"retentionSeconds": 3600,
+		"processingRate":   2,
+	}
+
+	// Tick 1: Add 3 messages
+	reqs1 := []*Request{
+		{ID: "msg1", Type: "SEND"},
+		{ID: "msg2", Type: "SEND"},
+		{ID: "msg3", Type: "SEND"},
+	}
+	output1, _ := mq.Tick(props, reqs1, 1)
+
+	// Should queue all 3 messages
+	if len(output1) != 0 {
+		t.Errorf("Expected 0 output on first tick, got %d", len(output1))
+	}
+
+	// Tick 2: Add 2 more messages, process 2
+	reqs2 := []*Request{
+		{ID: "msg4", Type: "SEND"},
+		{ID: "msg5", Type: "SEND"},
+	}
+	output2, _ := mq.Tick(props, reqs2, 2)
+
+	// Should process 2 messages
+	if len(output2) != 2 {
+		t.Errorf("Expected 2 processed messages, got %d", len(output2))
+	}
+
+	// Should have 3 messages in queue (3 remaining + 2 new - 2 processed)
+	messageQueue, _ := props["_messageQueue"].([]QueuedMessage)
+	if len(messageQueue) != 3 {
+		t.Errorf("Expected 3 messages in queue, got %d", len(messageQueue))
+	}
+
+	// Check processing order
+	if output2[0].ID != "msg1" || output2[1].ID != "msg2" {
+		t.Errorf("Expected FIFO processing order, got %s, %s", output2[0].ID, output2[1].ID)
+	}
+}
--- a/internal/simulation/microservice.go
+++ b/internal/simulation/microservice.go
@ -0,0 +1,162 @@
				@@ -0,0 +1,162 @@
+package simulation
+
+import "math"
+
+type MicroserviceLogic struct{}
+
+type ServiceInstance struct {
+	ID           int
+	CurrentLoad  int
+	HealthStatus string
+}
+
+func (m MicroserviceLogic) Tick(props map[string]any, queue []*Request, tick int) ([]*Request, bool) {
+	// Extract microservice properties
+	instanceCount := int(AsFloat64(props["instanceCount"]))
+	if instanceCount == 0 {
+		instanceCount = 1 // default to 1 instance
+	}
+
+	cpu := int(AsFloat64(props["cpu"]))
+	if cpu == 0 {
+		cpu = 2 // default 2 CPU cores
+	}
+
+	ramGb := int(AsFloat64(props["ramGb"]))
+	if ramGb == 0 {
+		ramGb = 4 // default 4GB RAM
+	}
+
+	rpsCapacity := int(AsFloat64(props["rpsCapacity"]))
+	if rpsCapacity == 0 {
+		rpsCapacity = 100 // default capacity per instance
+	}
+
+	scalingStrategy := AsString(props["scalingStrategy"])
+	if scalingStrategy == "" {
+		scalingStrategy = "auto"
+	}
+
+	// Calculate base latency based on resource specs
+	baseLatencyMs := m.calculateBaseLatency(cpu, ramGb)
+
+	// Auto-scaling logic: adjust instance count based on load
+	currentLoad := len(queue)
+	if scalingStrategy == "auto" {
+		instanceCount = m.autoScale(instanceCount, currentLoad, rpsCapacity)
+		props["instanceCount"] = float64(instanceCount) // update for next tick
+	}
+
+	// Total capacity across all instances
+	totalCapacity := instanceCount * rpsCapacity
+
+	// Process requests up to total capacity
+	toProcess := queue
+	if len(queue) > totalCapacity {
+		toProcess = queue[:totalCapacity]
+	}
+
+	output := []*Request{}
+
+	// Distribute requests across instances using round-robin
+	for i, req := range toProcess {
+
+		// Create processed request copy
+		reqCopy := *req
+
+		// Add microservice processing latency
+		processingLatency := baseLatencyMs
+
+		// Simulate CPU-bound vs I/O-bound operations
+		if req.Type == "GET" {
+			processingLatency = baseLatencyMs // Fast reads
+		} else if req.Type == "POST" || req.Type == "PUT" {
+			processingLatency = baseLatencyMs + 10 // Writes take longer
+		} else if req.Type == "COMPUTE" {
+			processingLatency = baseLatencyMs + 50 // CPU-intensive operations
+		}
+
+		// Instance load affects latency (queuing delay)
+		instanceLoad := m.calculateInstanceLoad(i, len(toProcess), instanceCount)
+		if float64(instanceLoad) > float64(rpsCapacity)*0.8 { // Above 80% capacity
+			processingLatency += int(float64(processingLatency) * 0.5) // 50% penalty
+		}
+
+		reqCopy.LatencyMS += processingLatency
+		reqCopy.Path = append(reqCopy.Path, "microservice-processed")
+
+		output = append(output, &reqCopy)
+	}
+
+	// Health check: service is healthy if not severely overloaded
+	healthy := len(queue) <= totalCapacity*2 // Allow some buffering
+
+	return output, healthy
+}
+
+// calculateBaseLatency determines base processing time based on resources
+func (m MicroserviceLogic) calculateBaseLatency(cpu, ramGb int) int {
+	// Better CPU and RAM = lower base latency
+	// Formula: base latency inversely proportional to resources
+	cpuFactor := float64(cpu)
+	ramFactor := float64(ramGb) / 4.0 // Normalize to 4GB baseline
+
+	resourceScore := cpuFactor * ramFactor
+	if resourceScore < 1 {
+		resourceScore = 1
+	}
+
+	baseLatency := int(50.0 / resourceScore) // 50ms baseline for 2CPU/4GB
+	if baseLatency < 5 {
+		baseLatency = 5 // Minimum 5ms processing time
+	}
+
+	return baseLatency
+}
+
+// autoScale implements simple auto-scaling logic
+func (m MicroserviceLogic) autoScale(currentInstances, currentLoad, rpsPerInstance int) int {
+	// Calculate desired instances based on current load
+	desiredInstances := int(math.Ceil(float64(currentLoad) / float64(rpsPerInstance)))
+
+	// Scale up/down gradually (max 25% change per tick)
+	maxChange := int(math.Max(1, float64(currentInstances)*0.25))
+
+	if desiredInstances > currentInstances {
+		// Scale up
+		newInstances := currentInstances + maxChange
+		if newInstances > desiredInstances {
+			newInstances = desiredInstances
+		}
+		// Cap at reasonable maximum
+		if newInstances > 20 {
+			newInstances = 20
+		}
+		return newInstances
+	} else if desiredInstances < currentInstances {
+		// Scale down (more conservative)
+		newInstances := currentInstances - int(math.Max(1, float64(maxChange)*0.5))
+		if newInstances < desiredInstances {
+			newInstances = desiredInstances
+		}
+		// Always maintain at least 1 instance
+		if newInstances < 1 {
+			newInstances = 1
+		}
+		return newInstances
+	}
+
+	return currentInstances
+}
+
+// calculateInstanceLoad estimates load on a specific instance
+func (m MicroserviceLogic) calculateInstanceLoad(instanceID, totalRequests, instanceCount int) int {
+	// Simple round-robin distribution
+	baseLoad := totalRequests / instanceCount
+	remainder := totalRequests % instanceCount
+
+	if instanceID < remainder {
+		return baseLoad + 1
+	}
+	return baseLoad
+}
--- a/internal/simulation/microservice_test.go
+++ b/internal/simulation/microservice_test.go
@ -0,0 +1,286 @@
				@@ -0,0 +1,286 @@
+package simulation
+
+import (
+	"testing"
+)
+
+func TestMicroserviceLogic_BasicProcessing(t *testing.T) {
+	logic := MicroserviceLogic{}
+
+	props := map[string]any{
+		"instanceCount":   2.0,
+		"cpu":             4.0,
+		"ramGb":           8.0,
+		"rpsCapacity":     100.0,
+		"scalingStrategy": "manual",
+	}
+
+	requests := []*Request{
+		{ID: "1", Type: "GET", LatencyMS: 0, Path: []string{}},
+		{ID: "2", Type: "POST", LatencyMS: 0, Path: []string{}},
+	}
+
+	output, healthy := logic.Tick(props, requests, 1)
+
+	if !healthy {
+		t.Error("Expected microservice to be healthy")
+	}
+
+	if len(output) != 2 {
+		t.Errorf("Expected 2 processed requests, got %d", len(output))
+	}
+
+	// Verify latency was added
+	for _, req := range output {
+		if req.LatencyMS == 0 {
+			t.Error("Expected latency to be added to processed request")
+		}
+		if len(req.Path) == 0 || req.Path[len(req.Path)-1] != "microservice-processed" {
+			t.Error("Expected path to be updated with microservice-processed")
+		}
+	}
+}
+
+func TestMicroserviceLogic_CapacityLimit(t *testing.T) {
+	logic := MicroserviceLogic{}
+
+	props := map[string]any{
+		"instanceCount":   1.0,
+		"rpsCapacity":     2.0,
+		"scalingStrategy": "manual",
+	}
+
+	// Send 4 requests, capacity is 2 (1 instance * 2 RPS)
+	// This should be healthy since 4 <= totalCapacity*2 (4)
+	requests := make([]*Request, 4)
+	for i := range requests {
+		requests[i] = &Request{ID: string(rune('1' + i)), Type: "GET", LatencyMS: 0}
+	}
+
+	output, healthy := logic.Tick(props, requests, 1)
+
+	if !healthy {
+		t.Error("Expected microservice to be healthy with moderate queuing")
+	}
+
+	// Should only process 2 requests (capacity limit)
+	if len(output) != 2 {
+		t.Errorf("Expected 2 processed requests due to capacity limit, got %d", len(output))
+	}
+}
+
+func TestMicroserviceLogic_AutoScaling(t *testing.T) {
+	logic := MicroserviceLogic{}
+
+	props := map[string]any{
+		"instanceCount":   1.0,
+		"rpsCapacity":     10.0,
+		"scalingStrategy": "auto",
+	}
+
+	// Send 25 requests to trigger scaling
+	requests := make([]*Request, 25)
+	for i := range requests {
+		requests[i] = &Request{ID: string(rune('1' + i)), Type: "GET", LatencyMS: 0}
+	}
+
+	output, healthy := logic.Tick(props, requests, 1)
+
+	// Check if instances were scaled up
+	newInstanceCount := int(props["instanceCount"].(float64))
+	if newInstanceCount <= 1 {
+		t.Error("Expected auto-scaling to increase instance count")
+	}
+
+	// Should process more than 10 requests (original capacity)
+	if len(output) <= 10 {
+		t.Errorf("Expected auto-scaling to increase processing capacity, got %d", len(output))
+	}
+
+	if !healthy {
+		t.Error("Expected microservice to be healthy after scaling")
+	}
+}
+
+func TestMicroserviceLogic_ResourceBasedLatency(t *testing.T) {
+	logic := MicroserviceLogic{}
+
+	// High-resource microservice
+	highResourceProps := map[string]any{
+		"instanceCount":   1.0,
+		"cpu":             8.0,
+		"ramGb":           16.0,
+		"rpsCapacity":     100.0,
+		"scalingStrategy": "manual",
+	}
+
+	// Low-resource microservice
+	lowResourceProps := map[string]any{
+		"instanceCount":   1.0,
+		"cpu":             1.0,
+		"ramGb":           1.0,
+		"rpsCapacity":     100.0,
+		"scalingStrategy": "manual",
+	}
+
+	request := []*Request{{ID: "1", Type: "GET", LatencyMS: 0, Path: []string{}}}
+
+	highOutput, _ := logic.Tick(highResourceProps, request, 1)
+	lowOutput, _ := logic.Tick(lowResourceProps, request, 1)
+
+	highLatency := highOutput[0].LatencyMS
+	lowLatency := lowOutput[0].LatencyMS
+
+	if lowLatency <= highLatency {
+		t.Errorf("Expected low-resource microservice (%dms) to have higher latency than high-resource (%dms)",
+			lowLatency, highLatency)
+	}
+}
+
+func TestMicroserviceLogic_RequestTypeLatency(t *testing.T) {
+	logic := MicroserviceLogic{}
+
+	props := map[string]any{
+		"instanceCount":   1.0,
+		"cpu":             2.0,
+		"ramGb":           4.0,
+		"rpsCapacity":     100.0,
+		"scalingStrategy": "manual",
+	}
+
+	getRequest := []*Request{{ID: "1", Type: "GET", LatencyMS: 0, Path: []string{}}}
+	postRequest := []*Request{{ID: "2", Type: "POST", LatencyMS: 0, Path: []string{}}}
+	computeRequest := []*Request{{ID: "3", Type: "COMPUTE", LatencyMS: 0, Path: []string{}}}
+
+	getOutput, _ := logic.Tick(props, getRequest, 1)
+	postOutput, _ := logic.Tick(props, postRequest, 1)
+	computeOutput, _ := logic.Tick(props, computeRequest, 1)
+
+	getLatency := getOutput[0].LatencyMS
+	postLatency := postOutput[0].LatencyMS
+	computeLatency := computeOutput[0].LatencyMS
+
+	if getLatency >= postLatency {
+		t.Errorf("Expected GET (%dms) to be faster than POST (%dms)", getLatency, postLatency)
+	}
+
+	if postLatency >= computeLatency {
+		t.Errorf("Expected POST (%dms) to be faster than COMPUTE (%dms)", postLatency, computeLatency)
+	}
+}
+
+func TestMicroserviceLogic_HighLoadLatencyPenalty(t *testing.T) {
+	logic := MicroserviceLogic{}
+
+	props := map[string]any{
+		"instanceCount":   1.0,
+		"cpu":             2.0,
+		"ramGb":           4.0,
+		"rpsCapacity":     10.0,
+		"scalingStrategy": "manual",
+	}
+
+	// Low load scenario
+	lowLoadRequest := []*Request{{ID: "1", Type: "GET", LatencyMS: 0, Path: []string{}}}
+	lowOutput, _ := logic.Tick(props, lowLoadRequest, 1)
+	lowLatency := lowOutput[0].LatencyMS
+
+	// High load scenario (above 80% capacity threshold)
+	highLoadRequests := make([]*Request, 9) // 90% of 10 RPS capacity
+	for i := range highLoadRequests {
+		highLoadRequests[i] = &Request{ID: string(rune('1' + i)), Type: "GET", LatencyMS: 0, Path: []string{}}
+	}
+	highOutput, _ := logic.Tick(props, highLoadRequests, 1)
+
+	// Check if first request has higher latency due to load
+	highLatency := highOutput[0].LatencyMS
+
+	if highLatency <= lowLatency {
+		t.Errorf("Expected high load scenario (%dms) to have higher latency than low load (%dms)",
+			highLatency, lowLatency)
+	}
+}
+
+func TestMicroserviceLogic_DefaultValues(t *testing.T) {
+	logic := MicroserviceLogic{}
+
+	// Empty props should use defaults
+	props := map[string]any{}
+
+	requests := []*Request{{ID: "1", Type: "GET", LatencyMS: 0, Path: []string{}}}
+
+	output, healthy := logic.Tick(props, requests, 1)
+
+	if !healthy {
+		t.Error("Expected microservice to be healthy with default values")
+	}
+
+	if len(output) != 1 {
+		t.Errorf("Expected 1 processed request with defaults, got %d", len(output))
+	}
+
+	// Should have reasonable default latency
+	if output[0].LatencyMS <= 0 || output[0].LatencyMS > 100 {
+		t.Errorf("Expected reasonable default latency, got %dms", output[0].LatencyMS)
+	}
+}
+
+func TestMicroserviceLogic_UnhealthyWhenOverloaded(t *testing.T) {
+	logic := MicroserviceLogic{}
+
+	props := map[string]any{
+		"instanceCount":   1.0,
+		"rpsCapacity":     5.0,
+		"scalingStrategy": "manual", // No auto-scaling
+	}
+
+	// Send way more requests than capacity (5 * 2 = 10 max before unhealthy)
+	requests := make([]*Request, 15) // 3x capacity
+	for i := range requests {
+		requests[i] = &Request{ID: string(rune('1' + i)), Type: "GET", LatencyMS: 0}
+	}
+
+	output, healthy := logic.Tick(props, requests, 1)
+
+	if healthy {
+		t.Error("Expected microservice to be unhealthy when severely overloaded")
+	}
+
+	// Should still process up to capacity
+	if len(output) != 5 {
+		t.Errorf("Expected 5 processed requests despite being overloaded, got %d", len(output))
+	}
+}
+
+func TestMicroserviceLogic_RoundRobinDistribution(t *testing.T) {
+	logic := MicroserviceLogic{}
+
+	props := map[string]any{
+		"instanceCount":   3.0,
+		"rpsCapacity":     10.0,
+		"scalingStrategy": "manual",
+	}
+
+	// Send 6 requests to be distributed across 3 instances
+	requests := make([]*Request, 6)
+	for i := range requests {
+		requests[i] = &Request{ID: string(rune('1' + i)), Type: "GET", LatencyMS: 0, Path: []string{}}
+	}
+
+	output, healthy := logic.Tick(props, requests, 1)
+
+	if !healthy {
+		t.Error("Expected microservice to be healthy")
+	}
+
+	if len(output) != 6 {
+		t.Errorf("Expected 6 processed requests, got %d", len(output))
+	}
+
+	// All requests should be processed (within total capacity of 30)
+	for _, req := range output {
+		if req.LatencyMS <= 0 {
+			t.Error("Expected all requests to have added latency")
+		}
+	}
+}
--- a/internal/simulation/monitoring.go
+++ b/internal/simulation/monitoring.go
@ -0,0 +1,221 @@
				@@ -0,0 +1,221 @@
+package simulation
+
+type MonitoringLogic struct{}
+
+type MetricData struct {
+	Timestamp    int
+	LatencySum   int
+	RequestCount int
+	ErrorCount   int
+	QueueSize    int
+}
+
+type AlertEvent struct {
+	Timestamp  int
+	MetricType string
+	Value      float64
+	Threshold  float64
+	Unit       string
+	Severity   string
+}
+
+func (m MonitoringLogic) Tick(props map[string]any, queue []*Request, tick int) ([]*Request, bool) {
+	// Extract monitoring properties
+	tool := AsString(props["tool"])
+	if tool == "" {
+		tool = "Prometheus" // default monitoring tool
+	}
+
+	alertMetric := AsString(props["alertMetric"])
+	if alertMetric == "" {
+		alertMetric = "latency" // default to latency monitoring
+	}
+
+	thresholdValue := int(AsFloat64(props["thresholdValue"]))
+	if thresholdValue == 0 {
+		thresholdValue = 100 // default threshold
+	}
+
+	thresholdUnit := AsString(props["thresholdUnit"])
+	if thresholdUnit == "" {
+		thresholdUnit = "ms" // default unit
+	}
+
+	// Get historical metrics from props
+	metrics, ok := props["_metrics"].([]MetricData)
+	if !ok {
+		metrics = []MetricData{}
+	}
+
+	// Get alert history
+	alerts, ok := props["_alerts"].([]AlertEvent)
+	if !ok {
+		alerts = []AlertEvent{}
+	}
+
+	currentTime := tick * 100 // Convert tick to milliseconds
+
+	// Process all incoming requests (monitoring is pass-through)
+	output := []*Request{}
+	totalLatency := 0
+	errorCount := 0
+
+	for _, req := range queue {
+		// Create a copy of the request to forward
+		reqCopy := *req
+
+		// Add minimal monitoring overhead (1-2ms for metric collection)
+		monitoringOverhead := 1
+		if tool == "Datadog" || tool == "New Relic" {
+			monitoringOverhead = 2 // More feature-rich tools have slightly higher overhead
+		}
+
+		reqCopy.LatencyMS += monitoringOverhead
+		reqCopy.Path = append(reqCopy.Path, "monitored")
+
+		// Collect metrics from the request
+		totalLatency += req.LatencyMS
+
+		// Simple heuristic: requests with high latency are considered errors
+		if req.LatencyMS > 1000 { // 1 second threshold for errors
+			errorCount++
+		}
+
+		output = append(output, &reqCopy)
+	}
+
+	// Calculate current metrics
+	avgLatency := 0.0
+	if len(queue) > 0 {
+		avgLatency = float64(totalLatency) / float64(len(queue))
+	}
+
+	// Store current metrics
+	currentMetric := MetricData{
+		Timestamp:    currentTime,
+		LatencySum:   totalLatency,
+		RequestCount: len(queue),
+		ErrorCount:   errorCount,
+		QueueSize:    len(queue),
+	}
+
+	// Add to metrics history (keep last 10 data points)
+	metrics = append(metrics, currentMetric)
+	if len(metrics) > 10 {
+		metrics = metrics[1:]
+	}
+
+	// Check alert conditions
+	shouldAlert := false
+	alertValue := 0.0
+
+	switch alertMetric {
+	case "latency":
+		alertValue = avgLatency
+		if avgLatency > float64(thresholdValue) && len(queue) > 0 {
+			shouldAlert = true
+		}
+	case "throughput":
+		alertValue = float64(len(queue))
+		if len(queue) < thresholdValue { // Low throughput alert
+			shouldAlert = true
+		}
+	case "error_rate":
+		errorRate := 0.0
+		if len(queue) > 0 {
+			errorRate = float64(errorCount) / float64(len(queue)) * 100
+		}
+		alertValue = errorRate
+		if errorRate > float64(thresholdValue) {
+			shouldAlert = true
+		}
+	case "queue_size":
+		alertValue = float64(len(queue))
+		if len(queue) > thresholdValue {
+			shouldAlert = true
+		}
+	}
+
+	// Generate alert if threshold exceeded
+	if shouldAlert {
+		severity := "warning"
+		if alertValue > float64(thresholdValue)*1.5 { // 150% of threshold
+			severity = "critical"
+		}
+
+		alert := AlertEvent{
+			Timestamp:  currentTime,
+			MetricType: alertMetric,
+			Value:      alertValue,
+			Threshold:  float64(thresholdValue),
+			Unit:       thresholdUnit,
+			Severity:   severity,
+		}
+
+		// Only add alert if it's not a duplicate of the last alert
+		if len(alerts) == 0 || !m.isDuplicateAlert(alerts[len(alerts)-1], alert) {
+			alerts = append(alerts, alert)
+		}
+
+		// Keep only last 20 alerts
+		if len(alerts) > 20 {
+			alerts = alerts[1:]
+		}
+	}
+
+	// Update props with collected data
+	props["_metrics"] = metrics
+	props["_alerts"] = alerts
+	props["_currentLatency"] = avgLatency
+	props["_alertCount"] = len(alerts)
+
+	// Monitoring system health - it's healthy unless it's completely overloaded
+	healthy := len(queue) < 10000 // Can handle very high loads
+
+	// If too many critical alerts recently, mark as unhealthy
+	recentCriticalAlerts := 0
+	for _, alert := range alerts {
+		if currentTime-alert.Timestamp < 10000 && alert.Severity == "critical" { // Last 10 seconds
+			recentCriticalAlerts++
+		}
+	}
+
+	if recentCriticalAlerts > 5 {
+		healthy = false
+	}
+
+	return output, healthy
+}
+
+// isDuplicateAlert checks if an alert is similar to the previous one to avoid spam
+func (m MonitoringLogic) isDuplicateAlert(prev, current AlertEvent) bool {
+	return prev.MetricType == current.MetricType &&
+		prev.Severity == current.Severity &&
+		(current.Timestamp-prev.Timestamp) < 5000 // Within 5 seconds
+}
+
+// Helper function to calculate moving average
+func (m MonitoringLogic) calculateMovingAverage(metrics []MetricData, window int) float64 {
+	if len(metrics) == 0 {
+		return 0
+	}
+
+	start := 0
+	if len(metrics) > window {
+		start = len(metrics) - window
+	}
+
+	sum := 0.0
+	count := 0
+	for i := start; i < len(metrics); i++ {
+		if metrics[i].RequestCount > 0 {
+			sum += float64(metrics[i].LatencySum) / float64(metrics[i].RequestCount)
+			count++
+		}
+	}
+
+	if count == 0 {
+		return 0
+	}
+	return sum / float64(count)
+}
--- a/internal/simulation/monitoring_test.go
+++ b/internal/simulation/monitoring_test.go
@ -0,0 +1,411 @@
				@@ -0,0 +1,411 @@
+package simulation
+
+import (
+	"testing"
+)
+
+func TestMonitoringLogic_BasicPassthrough(t *testing.T) {
+	logic := MonitoringLogic{}
+
+	props := map[string]any{
+		"tool":           "Prometheus",
+		"alertMetric":    "latency",
+		"thresholdValue": 100.0,
+		"thresholdUnit":  "ms",
+	}
+
+	requests := []*Request{
+		{ID: "1", Type: "GET", LatencyMS: 50, Path: []string{}},
+		{ID: "2", Type: "POST", LatencyMS: 75, Path: []string{}},
+	}
+
+	output, healthy := logic.Tick(props, requests, 1)
+
+	if !healthy {
+		t.Error("Expected monitoring to be healthy")
+	}
+
+	if len(output) != 2 {
+		t.Errorf("Expected 2 requests to pass through monitoring, got %d", len(output))
+	}
+
+	// Verify minimal latency overhead was added
+	for i, req := range output {
+		originalLatency := requests[i].LatencyMS
+		if req.LatencyMS <= originalLatency {
+			t.Errorf("Expected monitoring overhead to be added to latency")
+		}
+		if req.LatencyMS > originalLatency+5 {
+			t.Errorf("Expected minimal monitoring overhead, got %d ms added", req.LatencyMS-originalLatency)
+		}
+		if len(req.Path) == 0 || req.Path[len(req.Path)-1] != "monitored" {
+			t.Error("Expected path to be updated with 'monitored'")
+		}
+	}
+}
+
+func TestMonitoringLogic_MetricsCollection(t *testing.T) {
+	logic := MonitoringLogic{}
+
+	props := map[string]any{
+		"tool":           "Datadog",
+		"alertMetric":    "latency",
+		"thresholdValue": 100.0,
+		"thresholdUnit":  "ms",
+	}
+
+	requests := []*Request{
+		{ID: "1", Type: "GET", LatencyMS: 50},
+		{ID: "2", Type: "POST", LatencyMS: 150},
+		{ID: "3", Type: "GET", LatencyMS: 75},
+	}
+
+	_, healthy := logic.Tick(props, requests, 1)
+
+	if !healthy {
+		t.Error("Expected monitoring to be healthy")
+	}
+
+	// Check that metrics were collected
+	metrics, ok := props["_metrics"].([]MetricData)
+	if !ok {
+		t.Error("Expected metrics to be collected in props")
+	}
+
+	if len(metrics) != 1 {
+		t.Errorf("Expected 1 metric data point, got %d", len(metrics))
+	}
+
+	metric := metrics[0]
+	if metric.RequestCount != 3 {
+		t.Errorf("Expected 3 requests counted, got %d", metric.RequestCount)
+	}
+
+	if metric.LatencySum != 275 { // 50 + 150 + 75
+		t.Errorf("Expected latency sum of 275, got %d", metric.LatencySum)
+	}
+
+	// Check current latency calculation
+	currentLatency, ok := props["_currentLatency"].(float64)
+	if !ok {
+		t.Error("Expected current latency to be calculated")
+	}
+
+	if currentLatency < 90 || currentLatency > 95 {
+		t.Errorf("Expected average latency around 91.67, got %f", currentLatency)
+	}
+}
+
+func TestMonitoringLogic_LatencyAlert(t *testing.T) {
+	logic := MonitoringLogic{}
+
+	props := map[string]any{
+		"tool":           "Prometheus",
+		"alertMetric":    "latency",
+		"thresholdValue": 80.0,
+		"thresholdUnit":  "ms",
+	}
+
+	// Send requests that exceed latency threshold
+	requests := []*Request{
+		{ID: "1", Type: "GET", LatencyMS: 100},
+		{ID: "2", Type: "POST", LatencyMS: 120},
+	}
+
+	_, healthy := logic.Tick(props, requests, 1)
+
+	if !healthy {
+		t.Error("Expected monitoring to be healthy despite alerts")
+	}
+
+	// Check that alert was generated
+	alerts, ok := props["_alerts"].([]AlertEvent)
+	if !ok {
+		t.Error("Expected alerts to be stored in props")
+	}
+
+	if len(alerts) != 1 {
+		t.Errorf("Expected 1 alert to be generated, got %d", len(alerts))
+	}
+
+	alert := alerts[0]
+	if alert.MetricType != "latency" {
+		t.Errorf("Expected latency alert, got %s", alert.MetricType)
+	}
+
+	if alert.Threshold != 80.0 {
+		t.Errorf("Expected threshold of 80, got %f", alert.Threshold)
+	}
+
+	if alert.Value < 80.0 {
+		t.Errorf("Expected alert value to exceed threshold, got %f", alert.Value)
+	}
+
+	if alert.Severity != "warning" {
+		t.Errorf("Expected warning severity, got %s", alert.Severity)
+	}
+}
+
+func TestMonitoringLogic_ErrorRateAlert(t *testing.T) {
+	logic := MonitoringLogic{}
+
+	props := map[string]any{
+		"tool":           "Prometheus",
+		"alertMetric":    "error_rate",
+		"thresholdValue": 20.0, // 20% error rate threshold
+		"thresholdUnit":  "percent",
+	}
+
+	// Send mix of normal and high-latency (error) requests
+	requests := []*Request{
+		{ID: "1", Type: "GET", LatencyMS: 100},   // normal
+		{ID: "2", Type: "POST", LatencyMS: 1200}, // error (>1000ms)
+		{ID: "3", Type: "GET", LatencyMS: 200},   // normal
+		{ID: "4", Type: "POST", LatencyMS: 1500}, // error
+	}
+
+	_, healthy := logic.Tick(props, requests, 1)
+
+	if !healthy {
+		t.Error("Expected monitoring to be healthy")
+	}
+
+	// Check that error rate alert was generated (50% error rate > 20% threshold)
+	alerts, ok := props["_alerts"].([]AlertEvent)
+	if !ok {
+		t.Error("Expected alerts to be stored in props")
+	}
+
+	if len(alerts) != 1 {
+		t.Errorf("Expected 1 alert to be generated, got %d", len(alerts))
+	}
+
+	alert := alerts[0]
+	if alert.MetricType != "error_rate" {
+		t.Errorf("Expected error_rate alert, got %s", alert.MetricType)
+	}
+
+	if alert.Value != 50.0 { // 2 errors out of 4 requests = 50%
+		t.Errorf("Expected 50%% error rate, got %f", alert.Value)
+	}
+}
+
+func TestMonitoringLogic_QueueSizeAlert(t *testing.T) {
+	logic := MonitoringLogic{}
+
+	props := map[string]any{
+		"tool":           "Prometheus",
+		"alertMetric":    "queue_size",
+		"thresholdValue": 5.0,
+		"thresholdUnit":  "requests",
+	}
+
+	// Send more requests than threshold
+	requests := make([]*Request, 8)
+	for i := range requests {
+		requests[i] = &Request{ID: string(rune('1' + i)), Type: "GET", LatencyMS: 50}
+	}
+
+	_, healthy := logic.Tick(props, requests, 1)
+
+	if !healthy {
+		t.Error("Expected monitoring to be healthy with queue size alert")
+	}
+
+	// Check that queue size alert was generated
+	alerts, ok := props["_alerts"].([]AlertEvent)
+	if !ok {
+		t.Error("Expected alerts to be stored in props")
+	}
+
+	if len(alerts) != 1 {
+		t.Errorf("Expected 1 alert to be generated, got %d", len(alerts))
+	}
+
+	alert := alerts[0]
+	if alert.MetricType != "queue_size" {
+		t.Errorf("Expected queue_size alert, got %s", alert.MetricType)
+	}
+
+	if alert.Value != 8.0 {
+		t.Errorf("Expected queue size of 8, got %f", alert.Value)
+	}
+}
+
+func TestMonitoringLogic_CriticalAlert(t *testing.T) {
+	logic := MonitoringLogic{}
+
+	props := map[string]any{
+		"tool":           "Prometheus",
+		"alertMetric":    "latency",
+		"thresholdValue": 100.0,
+		"thresholdUnit":  "ms",
+	}
+
+	// Send requests with very high latency (150% of threshold)
+	requests := []*Request{
+		{ID: "1", Type: "GET", LatencyMS: 180}, // 180 > 150 (1.5 * 100)
+		{ID: "2", Type: "POST", LatencyMS: 200},
+	}
+
+	_, healthy := logic.Tick(props, requests, 1)
+
+	if !healthy {
+		t.Error("Expected monitoring to be healthy")
+	}
+
+	alerts, ok := props["_alerts"].([]AlertEvent)
+	if !ok {
+		t.Error("Expected alerts to be stored in props")
+	}
+
+	if len(alerts) != 1 {
+		t.Errorf("Expected 1 alert to be generated, got %d", len(alerts))
+	}
+
+	alert := alerts[0]
+	if alert.Severity != "critical" {
+		t.Errorf("Expected critical severity for high threshold breach, got %s", alert.Severity)
+	}
+}
+
+func TestMonitoringLogic_DuplicateAlertSuppression(t *testing.T) {
+	logic := MonitoringLogic{}
+
+	props := map[string]any{
+		"tool":           "Prometheus",
+		"alertMetric":    "latency",
+		"thresholdValue": 80.0,
+		"thresholdUnit":  "ms",
+	}
+
+	requests := []*Request{
+		{ID: "1", Type: "GET", LatencyMS: 100},
+	}
+
+	// First tick - should generate alert
+	logic.Tick(props, requests, 1)
+
+	alerts, _ := props["_alerts"].([]AlertEvent)
+	if len(alerts) != 1 {
+		t.Errorf("Expected 1 alert after first tick, got %d", len(alerts))
+	}
+
+	// Second tick immediately after - should suppress duplicate
+	logic.Tick(props, requests, 2)
+
+	alerts, _ = props["_alerts"].([]AlertEvent)
+	if len(alerts) != 1 {
+		t.Errorf("Expected duplicate alert to be suppressed, got %d alerts", len(alerts))
+	}
+}
+
+func TestMonitoringLogic_DefaultValues(t *testing.T) {
+	logic := MonitoringLogic{}
+
+	// Empty props should use defaults
+	props := map[string]any{}
+
+	requests := []*Request{{ID: "1", Type: "GET", LatencyMS: 50, Path: []string{}}}
+
+	output, healthy := logic.Tick(props, requests, 1)
+
+	if !healthy {
+		t.Error("Expected monitoring to be healthy with default values")
+	}
+
+	if len(output) != 1 {
+		t.Errorf("Expected 1 request to pass through, got %d", len(output))
+	}
+
+	// Should have reasonable default monitoring overhead
+	if output[0].LatencyMS <= 50 || output[0].LatencyMS > 55 {
+		t.Errorf("Expected default monitoring overhead, got %dms total", output[0].LatencyMS)
+	}
+}
+
+func TestMonitoringLogic_ToolSpecificOverhead(t *testing.T) {
+	logic := MonitoringLogic{}
+
+	// Test Prometheus (lower overhead)
+	propsPrometheus := map[string]any{
+		"tool": "Prometheus",
+	}
+
+	// Test Datadog (higher overhead)
+	propsDatadog := map[string]any{
+		"tool": "Datadog",
+	}
+
+	request := []*Request{{ID: "1", Type: "GET", LatencyMS: 50, Path: []string{}}}
+
+	prometheusOutput, _ := logic.Tick(propsPrometheus, request, 1)
+	datadogOutput, _ := logic.Tick(propsDatadog, request, 1)
+
+	prometheusOverhead := prometheusOutput[0].LatencyMS - 50
+	datadogOverhead := datadogOutput[0].LatencyMS - 50
+
+	if datadogOverhead <= prometheusOverhead {
+		t.Errorf("Expected Datadog (%dms) to have higher overhead than Prometheus (%dms)",
+			datadogOverhead, prometheusOverhead)
+	}
+}
+
+func TestMonitoringLogic_UnhealthyWithManyAlerts(t *testing.T) {
+	logic := MonitoringLogic{}
+
+	props := map[string]any{
+		"tool":           "Prometheus",
+		"alertMetric":    "latency",
+		"thresholdValue": 50.0,
+		"thresholdUnit":  "ms",
+	}
+
+	// Manually create many recent critical alerts to simulate an unhealthy state
+	currentTime := 10000 // 10 seconds
+	recentAlerts := []AlertEvent{
+		{Timestamp: currentTime - 1000, MetricType: "latency", Severity: "critical", Value: 200},
+		{Timestamp: currentTime - 2000, MetricType: "latency", Severity: "critical", Value: 180},
+		{Timestamp: currentTime - 3000, MetricType: "latency", Severity: "critical", Value: 190},
+		{Timestamp: currentTime - 4000, MetricType: "latency", Severity: "critical", Value: 170},
+		{Timestamp: currentTime - 5000, MetricType: "latency", Severity: "critical", Value: 160},
+		{Timestamp: currentTime - 6000, MetricType: "latency", Severity: "critical", Value: 150},
+	}
+
+	// Set up the props with existing critical alerts
+	props["_alerts"] = recentAlerts
+
+	// Make a request that would trigger another alert (low latency to avoid triggering new alert)
+	requests := []*Request{{ID: "1", Type: "GET", LatencyMS: 40}}
+
+	// This tick should recognize the existing critical alerts and mark system as unhealthy
+	_, healthy := logic.Tick(props, requests, 100) // tick 100 = 10000ms
+
+	if healthy {
+		t.Error("Expected monitoring to be unhealthy due to many recent critical alerts")
+	}
+}
+
+func TestMonitoringLogic_MetricsHistoryLimit(t *testing.T) {
+	logic := MonitoringLogic{}
+
+	props := map[string]any{
+		"tool": "Prometheus",
+	}
+
+	request := []*Request{{ID: "1", Type: "GET", LatencyMS: 50}}
+
+	// Generate more than 10 metric data points
+	for i := 0; i < 15; i++ {
+		logic.Tick(props, request, i)
+	}
+
+	metrics, ok := props["_metrics"].([]MetricData)
+	if !ok {
+		t.Error("Expected metrics to be stored")
+	}
+
+	if len(metrics) != 10 {
+		t.Errorf("Expected metrics history to be limited to 10, got %d", len(metrics))
+	}
+}
--- a/internal/simulation/testdata/cache_design.json
+++ b/internal/simulation/testdata/cache_design.json
@ -0,0 +1,55 @@
				@@ -0,0 +1,55 @@
+{
+  "nodes": [
+    {
+      "id": "webserver",
+      "type": "webserver",
+      "position": { "x": 0, "y": 0 },
+      "props": {
+        "label": "Web Server",
+        "rpsCapacity": 100
+      }
+    },
+    {
+      "id": "cache",
+      "type": "cache",
+      "position": { "x": 100, "y": 0 },
+      "props": {
+        "label": "Redis Cache",
+        "cacheTTL": 300000,
+        "maxEntries": 1000,
+        "evictionPolicy": "LRU"
+      }
+    },
+    {
+      "id": "database",
+      "type": "database",
+      "position": { "x": 200, "y": 0 },
+      "props": {
+        "label": "Primary DB",
+        "replication": 2,
+        "maxRPS": 500,
+        "baseLatencyMs": 20
+      }
+    }
+  ],
+  "connections": [
+    {
+      "source": "webserver",
+      "target": "cache",
+      "label": "Cache Lookup",
+      "direction": "forward",
+      "protocol": "Redis",
+      "tls": false,
+      "capacity": 1000
+    },
+    {
+      "source": "cache",
+      "target": "database",
+      "label": "Cache Miss",
+      "direction": "forward",
+      "protocol": "TCP",
+      "tls": true,
+      "capacity": 1000
+    }
+  ]
+}
--- a/internal/simulation/testdata/database_design.json
+++ b/internal/simulation/testdata/database_design.json
@ -0,0 +1,35 @@
				@@ -0,0 +1,35 @@
+{
+  "nodes": [
+    {
+      "id": "webserver",
+      "type": "webserver",
+      "position": { "x": 0, "y": 0 },
+      "props": {
+        "label": "Web Server",
+        "rpsCapacity": 100
+      }
+    },
+    {
+      "id": "database",
+      "type": "database",
+      "position": { "x": 100, "y": 0 },
+      "props": {
+        "label": "Primary DB",
+        "replication": 2,
+        "maxRPS": 500,
+        "baseLatencyMs": 15
+      }
+    }
+  ],
+  "connections": [
+    {
+      "source": "webserver",
+      "target": "database",
+      "label": "DB Queries",
+      "direction": "forward",
+      "protocol": "TCP",
+      "tls": true,
+      "capacity": 1000
+    }
+  ]
+}
--- a/internal/simulation/testdata/datapipeline_design.json
+++ b/internal/simulation/testdata/datapipeline_design.json
@ -0,0 +1,188 @@
				@@ -0,0 +1,188 @@
+{
+  "nodes": [
+    {
+      "id": "data-source",
+      "type": "webserver",
+      "position": { "x": 100, "y": 200 },
+      "props": {
+        "label": "Data Ingestion API",
+        "rpsCapacity": 500
+      }
+    },
+    {
+      "id": "raw-data-queue",
+      "type": "messageQueue",
+      "position": { "x": 300, "y": 200 },
+      "props": {
+        "label": "Raw Data Queue",
+        "queueCapacity": 10000,
+        "retentionSeconds": 3600,
+        "processingRate": 200
+      }
+    },
+    {
+      "id": "etl-pipeline-1", 
+      "type": "data pipeline",
+      "position": { "x": 500, "y": 150 },
+      "props": {
+        "label": "Data Cleansing Pipeline",
+        "batchSize": 100,
+        "transformation": "validate"
+      }
+    },
+    {
+      "id": "etl-pipeline-2",
+      "type": "data pipeline",
+      "position": { "x": 500, "y": 250 },
+      "props": {
+        "label": "Data Transformation Pipeline",
+        "batchSize": 50,
+        "transformation": "aggregate"
+      }
+    },
+    {
+      "id": "ml-pipeline",
+      "type": "data pipeline",
+      "position": { "x": 700, "y": 150 },
+      "props": {
+        "label": "ML Feature Pipeline",
+        "batchSize": 200,
+        "transformation": "enrich"
+      }
+    },
+    {
+      "id": "analytics-pipeline",
+      "type": "data pipeline", 
+      "position": { "x": 700, "y": 250 },
+      "props": {
+        "label": "Analytics Pipeline",
+        "batchSize": 500,
+        "transformation": "join"
+      }
+    },
+    {
+      "id": "cache-1",
+      "type": "cache",
+      "position": { "x": 900, "y": 150 },
+      "props": {
+        "label": "Feature Cache",
+        "cacheTTL": 300,
+        "maxEntries": 50000,
+        "evictionPolicy": "LRU"
+      }
+    },
+    {
+      "id": "data-warehouse",
+      "type": "database",
+      "position": { "x": 900, "y": 250 },
+      "props": {
+        "label": "Data Warehouse",
+        "replication": 3,
+        "maxRPS": 1000,
+        "baseLatencyMs": 50
+      }
+    },
+    {
+      "id": "monitoring-1",
+      "type": "monitoring/alerting",
+      "position": { "x": 500, "y": 350 },
+      "props": {
+        "label": "Pipeline Monitor",
+        "tool": "Datadog",
+        "alertMetric": "latency",
+        "thresholdValue": 1000,
+        "thresholdUnit": "ms"
+      }
+    },
+    {
+      "id": "compression-pipeline",
+      "type": "data pipeline",
+      "position": { "x": 300, "y": 350 },
+      "props": {
+        "label": "Data Compression",
+        "batchSize": 1000,
+        "transformation": "compress"
+      }
+    }
+  ],
+  "connections": [
+    {
+      "source": "data-source",
+      "target": "raw-data-queue",
+      "label": "Raw Data Stream",
+      "protocol": "http"
+    },
+    {
+      "source": "raw-data-queue",
+      "target": "etl-pipeline-1",
+      "label": "Data Validation",
+      "protocol": "tcp"
+    },
+    {
+      "source": "raw-data-queue",
+      "target": "etl-pipeline-2",
+      "label": "Data Transformation",
+      "protocol": "tcp"
+    },
+    {
+      "source": "etl-pipeline-1",
+      "target": "ml-pipeline",
+      "label": "Clean Data",
+      "protocol": "tcp"
+    },
+    {
+      "source": "etl-pipeline-2",
+      "target": "analytics-pipeline",
+      "label": "Transformed Data",
+      "protocol": "tcp"
+    },
+    {
+      "source": "ml-pipeline",
+      "target": "cache-1",
+      "label": "ML Features",
+      "protocol": "tcp"
+    },
+    {
+      "source": "analytics-pipeline",
+      "target": "data-warehouse",
+      "label": "Analytics Data",
+      "protocol": "tcp"
+    },
+    {
+      "source": "etl-pipeline-1",
+      "target": "monitoring-1",
+      "label": "Pipeline Metrics",
+      "protocol": "http"
+    },
+    {
+      "source": "etl-pipeline-2",
+      "target": "monitoring-1",
+      "label": "Pipeline Metrics",
+      "protocol": "http"
+    },
+    {
+      "source": "ml-pipeline",
+      "target": "monitoring-1",
+      "label": "Pipeline Metrics",
+      "protocol": "http"
+    },
+    {
+      "source": "analytics-pipeline",
+      "target": "monitoring-1",
+      "label": "Pipeline Metrics",
+      "protocol": "http"
+    },
+    {
+      "source": "raw-data-queue",
+      "target": "compression-pipeline",
+      "label": "Archive Stream",
+      "protocol": "tcp"
+    },
+    {
+      "source": "compression-pipeline",
+      "target": "data-warehouse",
+      "label": "Compressed Archive",
+      "protocol": "tcp"
+    }
+  ]
+}
--- a/internal/simulation/testdata/messagequeue_design.json
+++ b/internal/simulation/testdata/messagequeue_design.json
@ -0,0 +1,53 @@
				@@ -0,0 +1,53 @@
+{
+  "nodes": [
+    {
+      "id": "producer",
+      "type": "webserver",
+      "position": { "x": 0, "y": 0 },
+      "props": {
+        "label": "Message Producer",
+        "rpsCapacity": 50
+      }
+    },
+    {
+      "id": "messagequeue",
+      "type": "messageQueue",
+      "position": { "x": 100, "y": 0 },
+      "props": {
+        "label": "Event Queue",
+        "queueCapacity": 1000,
+        "retentionSeconds": 3600,
+        "processingRate": 100
+      }
+    },
+    {
+      "id": "consumer",
+      "type": "webserver",
+      "position": { "x": 200, "y": 0 },
+      "props": {
+        "label": "Message Consumer",
+        "rpsCapacity": 80
+      }
+    }
+  ],
+  "connections": [
+    {
+      "source": "producer",
+      "target": "messagequeue",
+      "label": "Publish Messages",
+      "direction": "forward",
+      "protocol": "AMQP",
+      "tls": false,
+      "capacity": 1000
+    },
+    {
+      "source": "messagequeue",
+      "target": "consumer",
+      "label": "Consume Messages",
+      "direction": "forward",
+      "protocol": "AMQP",
+      "tls": false,
+      "capacity": 1000
+    }
+  ]
+}
--- a/internal/simulation/testdata/microservice_design.json
+++ b/internal/simulation/testdata/microservice_design.json
@ -0,0 +1,96 @@
				@@ -0,0 +1,96 @@
+{
+  "nodes": [
+    {
+      "id": "webserver-1",
+      "type": "webserver",
+      "position": { "x": 100, "y": 200 },
+      "props": {
+        "label": "API Gateway",
+        "rpsCapacity": 200
+      }
+    },
+    {
+      "id": "lb-1", 
+      "type": "loadbalancer",
+      "position": { "x": 300, "y": 200 },
+      "props": {
+        "label": "API Gateway",
+        "algorithm": "round-robin"
+      }
+    },
+    {
+      "id": "microservice-1",
+      "type": "microservice",
+      "position": { "x": 500, "y": 150 },
+      "props": {
+        "label": "User Service",
+        "instanceCount": 3,
+        "cpu": 4,
+        "ramGb": 8,
+        "rpsCapacity": 100,
+        "monthlyUsd": 150,
+        "scalingStrategy": "auto",
+        "apiVersion": "v2"
+      }
+    },
+    {
+      "id": "microservice-2", 
+      "type": "microservice",
+      "position": { "x": 500, "y": 250 },
+      "props": {
+        "label": "Order Service",
+        "instanceCount": 2,
+        "cpu": 2,
+        "ramGb": 4,
+        "rpsCapacity": 80,
+        "monthlyUsd": 90,
+        "scalingStrategy": "manual",
+        "apiVersion": "v1"
+      }
+    },
+    {
+      "id": "db-1",
+      "type": "database",
+      "position": { "x": 700, "y": 200 },
+      "props": {
+        "label": "PostgreSQL",
+        "replication": 2,
+        "maxRPS": 500,
+        "baseLatencyMs": 15
+      }
+    }
+  ],
+  "connections": [
+    {
+      "source": "webserver-1",
+      "target": "lb-1",
+      "label": "HTTPS Requests",
+      "protocol": "https",
+      "tls": true
+    },
+    {
+      "source": "lb-1", 
+      "target": "microservice-1",
+      "label": "User API",
+      "protocol": "http"
+    },
+    {
+      "source": "lb-1",
+      "target": "microservice-2", 
+      "label": "Order API",
+      "protocol": "http"
+    },
+    {
+      "source": "microservice-1",
+      "target": "db-1",
+      "label": "User Queries",
+      "protocol": "tcp"
+    },
+    {
+      "source": "microservice-2",
+      "target": "db-1",
+      "label": "Order Queries", 
+      "protocol": "tcp"
+    }
+  ]
+}
--- a/internal/simulation/testdata/monitoring_design.json
+++ b/internal/simulation/testdata/monitoring_design.json
@ -0,0 +1,127 @@
				@@ -0,0 +1,127 @@
+{
+  "nodes": [
+    {
+      "id": "webserver-1",
+      "type": "webserver",
+      "position": { "x": 100, "y": 200 },
+      "props": {
+        "label": "Web Server",
+        "rpsCapacity": 100
+      }
+    },
+    {
+      "id": "monitor-1", 
+      "type": "monitoring/alerting",
+      "position": { "x": 300, "y": 200 },
+      "props": {
+        "label": "Prometheus Monitor",
+        "tool": "Prometheus",
+        "alertMetric": "latency",
+        "thresholdValue": 80,
+        "thresholdUnit": "ms"
+      }
+    },
+    {
+      "id": "lb-1",
+      "type": "loadbalancer",
+      "position": { "x": 500, "y": 200 },
+      "props": {
+        "label": "Load Balancer",
+        "algorithm": "round-robin"
+      }
+    },
+    {
+      "id": "microservice-1",
+      "type": "microservice",
+      "position": { "x": 700, "y": 150 },
+      "props": {
+        "label": "User Service",
+        "instanceCount": 2,
+        "cpu": 2,
+        "ramGb": 4,
+        "rpsCapacity": 50,
+        "scalingStrategy": "auto"
+      }
+    },
+    {
+      "id": "microservice-2", 
+      "type": "microservice",
+      "position": { "x": 700, "y": 250 },
+      "props": {
+        "label": "Order Service",
+        "instanceCount": 1,
+        "cpu": 1,
+        "ramGb": 2,
+        "rpsCapacity": 30,
+        "scalingStrategy": "manual"
+      }
+    },
+    {
+      "id": "monitor-2",
+      "type": "monitoring/alerting",
+      "position": { "x": 900, "y": 200 },
+      "props": {
+        "label": "Error Rate Monitor",
+        "tool": "Datadog",
+        "alertMetric": "error_rate",
+        "thresholdValue": 5,
+        "thresholdUnit": "percent"
+      }
+    },
+    {
+      "id": "db-1",
+      "type": "database",
+      "position": { "x": 1100, "y": 200 },
+      "props": {
+        "label": "PostgreSQL",
+        "replication": 2,
+        "maxRPS": 200,
+        "baseLatencyMs": 15
+      }
+    }
+  ],
+  "connections": [
+    {
+      "source": "webserver-1",
+      "target": "monitor-1",
+      "label": "HTTP Requests",
+      "protocol": "http"
+    },
+    {
+      "source": "monitor-1", 
+      "target": "lb-1",
+      "label": "Monitored Requests",
+      "protocol": "http"
+    },
+    {
+      "source": "lb-1",
+      "target": "microservice-1",
+      "label": "User API",
+      "protocol": "http"
+    },
+    {
+      "source": "lb-1",
+      "target": "microservice-2", 
+      "label": "Order API",
+      "protocol": "http"
+    },
+    {
+      "source": "microservice-1",
+      "target": "monitor-2",
+      "label": "Service Metrics",
+      "protocol": "http"
+    },
+    {
+      "source": "microservice-2",
+      "target": "monitor-2",
+      "label": "Service Metrics", 
+      "protocol": "http"
+    },
+    {
+      "source": "monitor-2",
+      "target": "db-1",
+      "label": "Database Queries",
+      "protocol": "tcp"
+    }
+  ]
+}
--- a/internal/simulation/testdata/simple_design.json
+++ b/internal/simulation/testdata/simple_design.json
@ -16,7 +16,7 @@
				@@ -16,7 +16,7 @@
      "props": {
        "label": "Web Server",
        "instanceSize": "medium",
-        "capacityRPS": 5,
+        "rpsCapacity": 5,
        "baseLatencyMs": 50,
        "penaltyPerRPS": 10
      }
--- a/internal/simulation/testdata/thirdpartyservice_design.json
+++ b/internal/simulation/testdata/thirdpartyservice_design.json
@ -0,0 +1,164 @@
				@@ -0,0 +1,164 @@
+{
+  "nodes": [
+    {
+      "id": "webserver-1",
+      "type": "webserver",
+      "position": { "x": 100, "y": 200 },
+      "props": {
+        "label": "E-commerce API",
+        "rpsCapacity": 200
+      }
+    },
+    {
+      "id": "microservice-1", 
+      "type": "microservice",
+      "position": { "x": 300, "y": 200 },
+      "props": {
+        "label": "Payment Service",
+        "instanceCount": 2,
+        "cpu": 4,
+        "ramGb": 8,
+        "rpsCapacity": 100,
+        "scalingStrategy": "auto"
+      }
+    },
+    {
+      "id": "stripe-service",
+      "type": "third party service",
+      "position": { "x": 500, "y": 150 },
+      "props": {
+        "label": "Stripe Payments",
+        "provider": "Stripe",
+        "latency": 180
+      }
+    },
+    {
+      "id": "twilio-service",
+      "type": "third party service",
+      "position": { "x": 500, "y": 250 },
+      "props": {
+        "label": "SMS Notifications",
+        "provider": "Twilio",
+        "latency": 250
+      }
+    },
+    {
+      "id": "microservice-2",
+      "type": "microservice",
+      "position": { "x": 300, "y": 350 },
+      "props": {
+        "label": "Notification Service",
+        "instanceCount": 1,
+        "cpu": 2,
+        "ramGb": 4,
+        "rpsCapacity": 50,
+        "scalingStrategy": "manual"
+      }
+    },
+    {
+      "id": "sendgrid-service",
+      "type": "third party service",
+      "position": { "x": 500, "y": 350 },
+      "props": {
+        "label": "Email Service",
+        "provider": "SendGrid",
+        "latency": 200
+      }
+    },
+    {
+      "id": "slack-service",
+      "type": "third party service",
+      "position": { "x": 500, "y": 450 },
+      "props": {
+        "label": "Slack Alerts",
+        "provider": "Slack",
+        "latency": 300
+      }
+    },
+    {
+      "id": "monitor-1",
+      "type": "monitoring/alerting",
+      "position": { "x": 700, "y": 200 },
+      "props": {
+        "label": "System Monitor",
+        "tool": "Datadog",
+        "alertMetric": "latency",
+        "thresholdValue": 500,
+        "thresholdUnit": "ms"
+      }
+    },
+    {
+      "id": "db-1",
+      "type": "database",
+      "position": { "x": 700, "y": 350 },
+      "props": {
+        "label": "Transaction DB",
+        "replication": 2,
+        "maxRPS": 300,
+        "baseLatencyMs": 20
+      }
+    }
+  ],
+  "connections": [
+    {
+      "source": "webserver-1",
+      "target": "microservice-1",
+      "label": "Payment Requests",
+      "protocol": "https"
+    },
+    {
+      "source": "microservice-1",
+      "target": "stripe-service",
+      "label": "Process Payment",
+      "protocol": "https"
+    },
+    {
+      "source": "microservice-1",
+      "target": "twilio-service",
+      "label": "SMS Confirmation",
+      "protocol": "https"
+    },
+    {
+      "source": "webserver-1",
+      "target": "microservice-2",
+      "label": "Notification Requests",
+      "protocol": "https"
+    },
+    {
+      "source": "microservice-2",
+      "target": "sendgrid-service",
+      "label": "Send Email",
+      "protocol": "https"
+    },
+    {
+      "source": "microservice-2",
+      "target": "slack-service",
+      "label": "Admin Alerts",
+      "protocol": "https"
+    },
+    {
+      "source": "stripe-service",
+      "target": "monitor-1",
+      "label": "Payment Metrics",
+      "protocol": "http"
+    },
+    {
+      "source": "twilio-service",
+      "target": "monitor-1",
+      "label": "SMS Metrics",
+      "protocol": "http"
+    },
+    {
+      "source": "sendgrid-service",
+      "target": "monitor-1",
+      "label": "Email Metrics",
+      "protocol": "http"
+    },
+    {
+      "source": "monitor-1",
+      "target": "db-1",
+      "label": "Store Metrics",
+      "protocol": "tcp"
+    }
+  ]
+}
--- a/internal/simulation/thirdpartyservice.go
+++ b/internal/simulation/thirdpartyservice.go
@ -0,0 +1,219 @@
				@@ -0,0 +1,219 @@
+package simulation
+
+import (
+	"math/rand"
+)
+
+type ThirdPartyServiceLogic struct{}
+
+type ServiceStatus struct {
+	IsUp          bool
+	LastCheck     int
+	FailureCount  int
+	SuccessCount  int
+	RateLimitHits int
+}
+
+func (t ThirdPartyServiceLogic) Tick(props map[string]any, queue []*Request, tick int) ([]*Request, bool) {
+	// Extract third-party service properties
+	provider := AsString(props["provider"])
+	if provider == "" {
+		provider = "Generic" // default provider
+	}
+
+	baseLatency := int(AsFloat64(props["latency"]))
+	if baseLatency == 0 {
+		baseLatency = 200 // default 200ms latency
+	}
+
+	// Get service status from props (persistent state)
+	status, ok := props["_serviceStatus"].(ServiceStatus)
+	if !ok {
+		status = ServiceStatus{
+			IsUp:          true,
+			LastCheck:     tick,
+			FailureCount:  0,
+			SuccessCount:  0,
+			RateLimitHits: 0,
+		}
+	}
+
+	currentTime := tick * 100 // Convert tick to milliseconds
+
+	// Simulate service availability and characteristics based on provider
+	reliability := t.getProviderReliability(provider)
+	rateLimitRPS := t.getProviderRateLimit(provider)
+	latencyVariance := t.getProviderLatencyVariance(provider)
+
+	// Check if service is down and should recover
+	if !status.IsUp {
+		// Services typically recover after some time
+		if currentTime-status.LastCheck > 30000 { // 30 seconds downtime
+			status.IsUp = true
+			status.FailureCount = 0
+		}
+	}
+
+	// Apply rate limiting - third-party services often have strict limits
+	requestsThisTick := len(queue)
+	if requestsThisTick > rateLimitRPS {
+		status.RateLimitHits++
+		// Only process up to rate limit
+		queue = queue[:rateLimitRPS]
+	}
+
+	output := []*Request{}
+
+	for _, req := range queue {
+		reqCopy := *req
+
+		// Simulate service availability
+		if !status.IsUp {
+			// Service is down - simulate timeout/error
+			reqCopy.LatencyMS += 10000 // 10 second timeout
+			reqCopy.Path = append(reqCopy.Path, "third-party-timeout")
+			status.FailureCount++
+		} else {
+			// Service is up - calculate response time
+			serviceLatency := t.calculateServiceLatency(provider, baseLatency, latencyVariance)
+
+			// Random failure based on reliability
+			if rand.Float64() > reliability {
+				// Service call failed
+				serviceLatency += 5000 // 5 second timeout on failure
+				reqCopy.Path = append(reqCopy.Path, "third-party-failed")
+				status.FailureCount++
+
+				// If too many failures, mark service as down
+				if status.FailureCount > 5 {
+					status.IsUp = false
+					status.LastCheck = currentTime
+				}
+			} else {
+				// Successful service call
+				reqCopy.Path = append(reqCopy.Path, "third-party-success")
+				status.SuccessCount++
+
+				// Reset failure count on successful calls
+				if status.FailureCount > 0 {
+					status.FailureCount--
+				}
+			}
+
+			reqCopy.LatencyMS += serviceLatency
+		}
+
+		output = append(output, &reqCopy)
+	}
+
+	// Update persistent state
+	props["_serviceStatus"] = status
+
+	// Health check: service is healthy if external service is up and not excessively rate limited
+	// Allow some rate limiting but not too much
+	maxRateLimitHits := 10 // Allow up to 10 rate limit hits before considering unhealthy
+	healthy := status.IsUp && status.RateLimitHits < maxRateLimitHits
+
+	return output, healthy
+}
+
+// getProviderReliability returns the reliability percentage for different providers
+func (t ThirdPartyServiceLogic) getProviderReliability(provider string) float64 {
+	switch provider {
+	case "Stripe":
+		return 0.999 // 99.9% uptime
+	case "Twilio":
+		return 0.998 // 99.8% uptime
+	case "SendGrid":
+		return 0.997 // 99.7% uptime
+	case "AWS":
+		return 0.9995 // 99.95% uptime
+	case "Google":
+		return 0.9999 // 99.99% uptime
+	case "Slack":
+		return 0.995 // 99.5% uptime
+	case "GitHub":
+		return 0.996 // 99.6% uptime
+	case "Shopify":
+		return 0.998 // 99.8% uptime
+	default:
+		return 0.99 // 99% uptime for generic services
+	}
+}
+
+// getProviderRateLimit returns the rate limit (requests per tick) for different providers
+func (t ThirdPartyServiceLogic) getProviderRateLimit(provider string) int {
+	switch provider {
+	case "Stripe":
+		return 100 // 100 requests per second (per tick in our sim)
+	case "Twilio":
+		return 50 // More restrictive
+	case "SendGrid":
+		return 200 // Email is typically higher volume
+	case "AWS":
+		return 1000 // Very high limits
+	case "Google":
+		return 500 // High but controlled
+	case "Slack":
+		return 30 // Very restrictive for chat APIs
+	case "GitHub":
+		return 60 // GitHub API limits
+	case "Shopify":
+		return 80 // E-commerce API limits
+	default:
+		return 100 // Default rate limit
+	}
+}
+
+// getProviderLatencyVariance returns the latency variance factor for different providers
+func (t ThirdPartyServiceLogic) getProviderLatencyVariance(provider string) float64 {
+	switch provider {
+	case "Stripe":
+		return 0.3 // Low variance, consistent performance
+	case "Twilio":
+		return 0.5 // Moderate variance
+	case "SendGrid":
+		return 0.4 // Email services are fairly consistent
+	case "AWS":
+		return 0.2 // Very consistent
+	case "Google":
+		return 0.25 // Very consistent
+	case "Slack":
+		return 0.6 // Chat services can be variable
+	case "GitHub":
+		return 0.4 // Moderate variance
+	case "Shopify":
+		return 0.5 // E-commerce can be variable under load
+	default:
+		return 0.5 // Default variance
+	}
+}
+
+// calculateServiceLatency computes the actual latency including variance
+func (t ThirdPartyServiceLogic) calculateServiceLatency(provider string, baseLatency int, variance float64) int {
+	// Add random variance to base latency
+	varianceMs := float64(baseLatency) * variance
+	randomVariance := (rand.Float64() - 0.5) * 2 * varianceMs // -variance to +variance
+
+	finalLatency := float64(baseLatency) + randomVariance
+
+	// Ensure minimum latency (can't be negative or too low)
+	if finalLatency < 10 {
+		finalLatency = 10
+	}
+
+	// Add provider-specific baseline adjustments
+	switch provider {
+	case "AWS", "Google":
+		// Cloud providers are typically fast
+		finalLatency *= 0.8
+	case "Slack":
+		// Chat APIs can be slower
+		finalLatency *= 1.2
+	case "Twilio":
+		// Telecom APIs have processing overhead
+		finalLatency *= 1.1
+	}
+
+	return int(finalLatency)
+}
--- a/internal/simulation/thirdpartyservice_test.go
+++ b/internal/simulation/thirdpartyservice_test.go
@ -0,0 +1,382 @@
				@@ -0,0 +1,382 @@
+package simulation
+
+import (
+	"testing"
+)
+
+func TestThirdPartyServiceLogic_BasicProcessing(t *testing.T) {
+	logic := ThirdPartyServiceLogic{}
+
+	props := map[string]any{
+		"provider": "Stripe",
+		"latency":  150.0,
+	}
+
+	requests := []*Request{
+		{ID: "1", Type: "POST", LatencyMS: 50, Path: []string{}},
+		{ID: "2", Type: "GET", LatencyMS: 30, Path: []string{}},
+	}
+
+	output, healthy := logic.Tick(props, requests, 1)
+
+	if !healthy {
+		t.Error("Expected third party service to be healthy")
+	}
+
+	if len(output) != 2 {
+		t.Errorf("Expected 2 processed requests, got %d", len(output))
+	}
+
+	// Verify latency was added (should be around base latency with some variance)
+	for i, req := range output {
+		originalLatency := requests[i].LatencyMS
+		if req.LatencyMS <= originalLatency {
+			t.Errorf("Expected third party service latency to be added")
+		}
+
+		// Check that path was updated
+		if len(req.Path) == 0 {
+			t.Error("Expected path to be updated")
+		}
+
+		lastPathElement := req.Path[len(req.Path)-1]
+		if lastPathElement != "third-party-success" && lastPathElement != "third-party-failed" {
+			t.Errorf("Expected path to indicate success or failure, got %s", lastPathElement)
+		}
+	}
+}
+
+func TestThirdPartyServiceLogic_ProviderCharacteristics(t *testing.T) {
+	logic := ThirdPartyServiceLogic{}
+
+	providers := []string{"Stripe", "AWS", "Slack", "Twilio"}
+
+	for _, provider := range providers {
+		t.Run(provider, func(t *testing.T) {
+			props := map[string]any{
+				"provider": provider,
+				"latency":  100.0,
+			}
+
+			requests := []*Request{{ID: "1", Type: "POST", LatencyMS: 0, Path: []string{}}}
+
+			output, healthy := logic.Tick(props, requests, 1)
+
+			if !healthy {
+				t.Errorf("Expected %s service to be healthy", provider)
+			}
+
+			if len(output) != 1 {
+				t.Errorf("Expected 1 processed request for %s", provider)
+			}
+
+			// Verify latency characteristics
+			addedLatency := output[0].LatencyMS
+			if addedLatency <= 0 {
+				t.Errorf("Expected %s to add latency", provider)
+			}
+
+			// AWS and Google should be faster than Slack
+			if provider == "AWS" && addedLatency > 200 {
+				t.Errorf("Expected AWS to have lower latency, got %dms", addedLatency)
+			}
+		})
+	}
+}
+
+func TestThirdPartyServiceLogic_RateLimiting(t *testing.T) {
+	logic := ThirdPartyServiceLogic{}
+
+	props := map[string]any{
+		"provider": "Slack", // Has low rate limit (30 RPS)
+		"latency":  100.0,
+	}
+
+	// Send more requests than rate limit
+	requests := make([]*Request, 50) // More than Slack's 30 RPS limit
+	for i := range requests {
+		requests[i] = &Request{ID: string(rune('1' + i)), Type: "POST", LatencyMS: 0}
+	}
+
+	output, healthy := logic.Tick(props, requests, 1)
+
+	// Should only process up to rate limit
+	if len(output) != 30 {
+		t.Errorf("Expected 30 processed requests due to Slack rate limit, got %d", len(output))
+	}
+
+	// Service should still be healthy with rate limiting
+	if !healthy {
+		t.Error("Expected service to be healthy despite rate limiting")
+	}
+
+	// Check that rate limit hits were recorded
+	status, ok := props["_serviceStatus"].(ServiceStatus)
+	if !ok {
+		t.Error("Expected service status to be recorded")
+	}
+
+	if status.RateLimitHits != 1 {
+		t.Errorf("Expected 1 rate limit hit, got %d", status.RateLimitHits)
+	}
+}
+
+func TestThirdPartyServiceLogic_ServiceFailure(t *testing.T) {
+	logic := ThirdPartyServiceLogic{}
+
+	props := map[string]any{
+		"provider": "Generic",
+		"latency":  100.0,
+	}
+
+	// Set up service as already having failures
+	status := ServiceStatus{
+		IsUp:         false,
+		LastCheck:    0,
+		FailureCount: 6,
+	}
+	props["_serviceStatus"] = status
+
+	requests := []*Request{{ID: "1", Type: "POST", LatencyMS: 50, Path: []string{}}}
+
+	output, healthy := logic.Tick(props, requests, 1)
+
+	if healthy {
+		t.Error("Expected service to be unhealthy when external service is down")
+	}
+
+	if len(output) != 1 {
+		t.Error("Expected request to be processed even when service is down")
+	}
+
+	// Should have very high latency due to timeout
+	if output[0].LatencyMS < 5000 {
+		t.Errorf("Expected high latency for service failure, got %dms", output[0].LatencyMS)
+	}
+
+	// Check path indicates timeout
+	lastPath := output[0].Path[len(output[0].Path)-1]
+	if lastPath != "third-party-timeout" {
+		t.Errorf("Expected timeout path, got %s", lastPath)
+	}
+}
+
+func TestThirdPartyServiceLogic_ServiceRecovery(t *testing.T) {
+	logic := ThirdPartyServiceLogic{}
+
+	props := map[string]any{
+		"provider": "Stripe",
+		"latency":  100.0,
+	}
+
+	// Set up service as down but with old timestamp (should recover)
+	status := ServiceStatus{
+		IsUp:         false,
+		LastCheck:    0, // Very old timestamp
+		FailureCount: 3,
+	}
+	props["_serviceStatus"] = status
+
+	requests := []*Request{{ID: "1", Type: "POST", LatencyMS: 50, Path: []string{}}}
+
+	// Run with current tick that's more than 30 seconds later
+	_, healthy := logic.Tick(props, requests, 400) // 40 seconds later
+
+	if !healthy {
+		t.Error("Expected service to be healthy after recovery")
+	}
+
+	// Check that service recovered
+	updatedStatus, ok := props["_serviceStatus"].(ServiceStatus)
+	if !ok {
+		t.Error("Expected updated service status")
+	}
+
+	if !updatedStatus.IsUp {
+		t.Error("Expected service to have recovered")
+	}
+
+	if updatedStatus.FailureCount != 0 {
+		t.Error("Expected failure count to be reset on recovery")
+	}
+}
+
+func TestThirdPartyServiceLogic_ReliabilityDifferences(t *testing.T) {
+	logic := ThirdPartyServiceLogic{}
+
+	// Test different reliability levels
+	testCases := []struct {
+		provider            string
+		expectedReliability float64
+	}{
+		{"AWS", 0.9995},
+		{"Google", 0.9999},
+		{"Stripe", 0.999},
+		{"Slack", 0.995},
+		{"Generic", 0.99},
+	}
+
+	for _, tc := range testCases {
+		reliability := logic.getProviderReliability(tc.provider)
+		if reliability != tc.expectedReliability {
+			t.Errorf("Expected %s reliability %.4f, got %.4f",
+				tc.provider, tc.expectedReliability, reliability)
+		}
+	}
+}
+
+func TestThirdPartyServiceLogic_RateLimitDifferences(t *testing.T) {
+	logic := ThirdPartyServiceLogic{}
+
+	// Test different rate limits
+	testCases := []struct {
+		provider      string
+		expectedLimit int
+	}{
+		{"AWS", 1000},
+		{"Stripe", 100},
+		{"Slack", 30},
+		{"SendGrid", 200},
+		{"Twilio", 50},
+	}
+
+	for _, tc := range testCases {
+		rateLimit := logic.getProviderRateLimit(tc.provider)
+		if rateLimit != tc.expectedLimit {
+			t.Errorf("Expected %s rate limit %d, got %d",
+				tc.provider, tc.expectedLimit, rateLimit)
+		}
+	}
+}
+
+func TestThirdPartyServiceLogic_LatencyVariance(t *testing.T) {
+	logic := ThirdPartyServiceLogic{}
+
+	props := map[string]any{
+		"provider": "Stripe",
+		"latency":  100.0,
+	}
+
+	requests := []*Request{{ID: "1", Type: "POST", LatencyMS: 0, Path: []string{}}}
+
+	latencies := []int{}
+
+	// Run multiple times to observe variance
+	for i := 0; i < 10; i++ {
+		output, _ := logic.Tick(props, requests, i)
+		latencies = append(latencies, output[0].LatencyMS)
+	}
+
+	// Check that we have variance (not all latencies are the same)
+	allSame := true
+	firstLatency := latencies[0]
+	for _, latency := range latencies[1:] {
+		if latency != firstLatency {
+			allSame = false
+			break
+		}
+	}
+
+	if allSame {
+		t.Error("Expected latency variance, but all latencies were the same")
+	}
+
+	// All latencies should be reasonable (between 50ms and 300ms for Stripe)
+	for _, latency := range latencies {
+		if latency < 50 || latency > 300 {
+			t.Errorf("Expected reasonable latency for Stripe, got %dms", latency)
+		}
+	}
+}
+
+func TestThirdPartyServiceLogic_DefaultValues(t *testing.T) {
+	logic := ThirdPartyServiceLogic{}
+
+	// Empty props should use defaults
+	props := map[string]any{}
+
+	requests := []*Request{{ID: "1", Type: "POST", LatencyMS: 0, Path: []string{}}}
+
+	output, healthy := logic.Tick(props, requests, 1)
+
+	if !healthy {
+		t.Error("Expected service to be healthy with default values")
+	}
+
+	if len(output) != 1 {
+		t.Error("Expected 1 processed request with defaults")
+	}
+
+	// Should have reasonable default latency (around 200ms base)
+	if output[0].LatencyMS < 100 || output[0].LatencyMS > 400 {
+		t.Errorf("Expected reasonable default latency, got %dms", output[0].LatencyMS)
+	}
+}
+
+func TestThirdPartyServiceLogic_SuccessCountTracking(t *testing.T) {
+	logic := ThirdPartyServiceLogic{}
+
+	props := map[string]any{
+		"provider": "AWS", // High reliability
+		"latency":  50.0,
+	}
+
+	requests := []*Request{{ID: "1", Type: "POST", LatencyMS: 0, Path: []string{}}}
+
+	// Run multiple successful requests
+	for i := 0; i < 5; i++ {
+		logic.Tick(props, requests, i)
+	}
+
+	status, ok := props["_serviceStatus"].(ServiceStatus)
+	if !ok {
+		t.Error("Expected service status to be tracked")
+	}
+
+	// Should have accumulated success count
+	if status.SuccessCount == 0 {
+		t.Error("Expected success count to be tracked")
+	}
+
+	// Should be healthy
+	if !status.IsUp {
+		t.Error("Expected service to remain up with successful calls")
+	}
+}
+
+func TestThirdPartyServiceLogic_FailureRecovery(t *testing.T) {
+	logic := ThirdPartyServiceLogic{}
+
+	props := map[string]any{
+		"provider": "Generic",
+		"latency":  100.0,
+	}
+
+	// Set up service with some failures but still up
+	status := ServiceStatus{
+		IsUp:         true,
+		FailureCount: 3,
+		SuccessCount: 0,
+	}
+	props["_serviceStatus"] = status
+
+	requests := []*Request{{ID: "1", Type: "POST", LatencyMS: 0, Path: []string{}}}
+
+	// Simulate a successful call (with high probability for Generic service)
+	// We'll run this multiple times to ensure we get at least one success
+	successFound := false
+	for i := 0; i < 10 && !successFound; i++ {
+		output, _ := logic.Tick(props, requests, i)
+		if len(output[0].Path) > 0 && output[0].Path[len(output[0].Path)-1] == "third-party-success" {
+			successFound = true
+		}
+	}
+
+	if successFound {
+		updatedStatus, _ := props["_serviceStatus"].(ServiceStatus)
+		// Failure count should have decreased
+		if updatedStatus.FailureCount >= 3 {
+			t.Error("Expected failure count to decrease after successful call")
+		}
+	}
+}
--- a/internal/simulation/webserver.go
+++ b/internal/simulation/webserver.go
@ -6,7 +6,7 @@ type WebServerLogic struct {
				@@ -6,7 +6,7 @@ type WebServerLogic struct {
 }

 func (l WebServerLogic) Tick(props map[string]any, queue []*Request, tick int) ([]*Request, bool) {
-	maxRPS := int(AsFloat64(props["capacityRPS"]))
+	maxRPS := int(AsFloat64(props["rpsCapacity"]))

 	toProcess := queue
 	if len(queue) > maxRPS {
--- a/router/handlers/simulation.go
+++ b/router/handlers/simulation.go
@ -2,17 +2,24 @@ package handlers
				@@ -2,17 +2,24 @@ package handlers

 import (
 	"encoding/json"
+	"fmt"
 	"net/http"
 	"systemdesigngame/internal/design"
+	"systemdesigngame/internal/level"
+	"systemdesigngame/internal/simulation"
 )

 type SimulationHandler struct{}

 type SimulationResponse struct {
-	Success  bool                   `json:"success"`
-	Metrics  map[string]interface{} `json:"metrics,omitempty"`
-	Timeline []interface{}          `json:"timeline,omitempty"`
-	Error    string                 `json:"error,omitempty"`
+	Success   bool                   `json:"success"`
+	Metrics   map[string]interface{} `json:"metrics,omitempty"`
+	Timeline  []interface{}          `json:"timeline,omitempty"`
+	Passed    bool                   `json:"passed,omitempty"`
+	Score     int                    `json:"score,omitempty"`
+	Feedback  []string               `json:"feedback,omitempty"`
+	LevelName string                 `json:"levelName,omitempty"`
+	Error     string                 `json:"error,omitempty"`
 }

 func (h *SimulationHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
@ -21,22 +28,96 @@ func (h *SimulationHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
				@@ -21,22 +28,96 @@ func (h *SimulationHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
 		return
 	}

-	var design design.Design
-	if err := json.NewDecoder(r.Body).Decode(&design); err != nil {
-		http.Error(w, "Invalid design JSON: "+err.Error(), http.StatusBadRequest)
+	var requestBody struct {
+		Design     design.Design `json:"design"`
+		LevelName  string        `json:"levelName,omitempty"`
+		Difficulty string        `json:"difficulty,omitempty"`
+	}
+
+	if err := json.NewDecoder(r.Body).Decode(&requestBody); err != nil {
+		// Try to decode as just design for backward compatibility
+		r.Body.Close()
+		var design design.Design
+		if err2 := json.NewDecoder(r.Body).Decode(&design); err2 != nil {
+			http.Error(w, "Invalid request JSON: "+err.Error(), http.StatusBadRequest)
+			return
+		}
+		requestBody.Design = design
+	}
+
+	// Extract the design for processing
+	design := requestBody.Design
+
+	// Run the actual simulation
+	engine := simulation.NewEngineFromDesign(design, 100)
+	if engine == nil {
+		response := SimulationResponse{
+			Success: false,
+			Error:   "Failed to create simulation engine - no valid components found",
+		}
+		w.Header().Set("Content-Type", "application/json")
+		json.NewEncoder(w).Encode(response)
 		return
 	}

-	// For now, return a mock successful response but eventually, we want to go to the results page(s)
+	// Set simulation parameters
+	engine.RPS = 50 // Default RPS - could be configurable later
+
+	// Find entry node by analyzing topology
+	entryNode := findEntryNode(design)
+
+	if entryNode == "" {
+		response := SimulationResponse{
+			Success: false,
+			Error:   "No entry point found - design must include a component with no incoming connections (webserver, microservice, load balancer, etc.)",
+		}
+		w.Header().Set("Content-Type", "application/json")
+		json.NewEncoder(w).Encode(response)
+		return
+	}
+
+	engine.EntryNode = entryNode
+
+	// Run simulation for 60 ticks (6 seconds at 100ms per tick)
+	snapshots := engine.Run(60, 100)
+
+	// Calculate metrics from snapshots
+	metrics := calculateMetrics(snapshots)
+
+	// Convert snapshots to interface{} for JSON serialization
+	timeline := make([]interface{}, len(snapshots))
+	for i, snapshot := range snapshots {
+		timeline[i] = snapshot
+	}
+
+	// Perform level validation if level info provided
+	var passed bool
+	var score int
+	var feedback []string
+	var levelName string
+
+	if requestBody.LevelName != "" {
+		difficulty := level.DifficultyEasy // default
+		if requestBody.Difficulty != "" {
+			difficulty = level.Difficulty(requestBody.Difficulty)
+		}
+
+		if lvl, err := level.GetLevel(requestBody.LevelName, difficulty); err == nil {
+			levelName = lvl.Name
+			passed, score, feedback = validateLevel(lvl, design, metrics)
+		} else {
+			feedback = []string{"Warning: Level not found, simulation ran without validation"}
+		}
+	}
+
 	response := SimulationResponse{
-		Success: true,
-		Metrics: map[string]interface{}{
-			"throughput":   250,
-			"latency_p95":  85,
-			"cost_monthly": 120,
-			"availability": 99.5,
-		},
-		Timeline: []interface{}{}, // Will contain TickSnapshots later
+		Success:   true,
+		Metrics:   metrics,
+		Timeline:  timeline,
+		Passed:    passed,
+		Score:     score,
+		Feedback:  feedback,
+		LevelName: levelName,
 	}

 	w.Header().Set("Content-Type", "application/json")
@ -45,3 +126,312 @@ func (h *SimulationHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
				@@ -45,3 +126,312 @@ func (h *SimulationHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
 		return
 	}
 }
+
+// calculateMetrics computes key performance metrics from simulation snapshots
+func calculateMetrics(snapshots []*simulation.TickSnapshot) map[string]interface{} {
+	if len(snapshots) == 0 {
+		return map[string]interface{}{
+			"throughput":   0,
+			"latency_avg":  0,
+			"cost_monthly": 0,
+			"availability": 0,
+		}
+	}
+
+	totalRequests := 0
+	totalLatency := 0
+	totalHealthy := 0
+	totalNodes := 0
+
+	// Calculate aggregate metrics across all snapshots
+	for _, snapshot := range snapshots {
+		// Count total requests processed in this tick
+		for _, requests := range snapshot.Emitted {
+			totalRequests += len(requests)
+			for _, req := range requests {
+				totalLatency += req.LatencyMS
+			}
+		}
+
+		// Count healthy vs total nodes
+		for _, healthy := range snapshot.NodeHealth {
+			totalNodes++
+			if healthy {
+				totalHealthy++
+			}
+		}
+	}
+
+	// Calculate throughput (requests per second)
+	// snapshots represent 6 seconds of simulation (60 ticks * 100ms)
+	simulationSeconds := float64(len(snapshots)) * 0.1 // 100ms per tick
+	throughput := float64(totalRequests) / simulationSeconds
+
+	// Calculate average latency
+	avgLatency := 0.0
+	if totalRequests > 0 {
+		avgLatency = float64(totalLatency) / float64(totalRequests)
+	}
+
+	// Calculate availability percentage
+	availability := 0.0
+	if totalNodes > 0 {
+		availability = (float64(totalHealthy) / float64(totalNodes)) * 100
+	}
+
+	// Estimate monthly cost (placeholder - could be enhanced)
+	monthlyCost := float64(totalNodes) * 50 // $50 per node per month baseline
+
+	return map[string]interface{}{
+		"throughput":   int(throughput),
+		"latency_avg":  int(avgLatency),
+		"cost_monthly": int(monthlyCost),
+		"availability": availability,
+	}
+}
+
+// findEntryNode analyzes the design topology to find the best entry point
+func findEntryNode(design design.Design) string {
+	// Build map of incoming connections
+	incomingCount := make(map[string]int)
+
+	// Initialize all nodes with 0 incoming connections
+	for _, node := range design.Nodes {
+		incomingCount[node.ID] = 0
+	}
+
+	// Count incoming connections for each node
+	for _, conn := range design.Connections {
+		incomingCount[conn.Target]++
+	}
+
+	// Find nodes with no incoming connections (potential entry points)
+	var entryPoints []string
+	for nodeID, count := range incomingCount {
+		if count == 0 {
+			entryPoints = append(entryPoints, nodeID)
+		}
+	}
+
+	// If multiple entry points exist, prefer certain types
+	if len(entryPoints) > 1 {
+		return preferredEntryPoint(design.Nodes, entryPoints)
+	} else if len(entryPoints) == 1 {
+		return entryPoints[0]
+	}
+
+	return "" // No entry point found
+}
+
+// preferredEntryPoint selects the best entry point from candidates based on component type
+func preferredEntryPoint(nodes []design.Node, candidateIDs []string) string {
+	// Priority order for entry points (most logical first)
+	priority := []string{
+		"webserver",
+		"microservice",
+		"loadBalancer",  // Could be edge load balancer
+		"cdn",           // Edge CDN
+		"data pipeline", // Data ingestion entry
+		"messageQueue",  // For event-driven architectures
+	}
+
+	// Create lookup for candidate nodes
+	candidates := make(map[string]design.Node)
+	for _, node := range nodes {
+		for _, id := range candidateIDs {
+			if node.ID == id {
+				candidates[id] = node
+				break
+			}
+		}
+	}
+
+	// Find highest priority candidate
+	for _, nodeType := range priority {
+		for id, node := range candidates {
+			if node.Type == nodeType {
+				return id
+			}
+		}
+	}
+
+	// If no preferred type, return first candidate
+	if len(candidateIDs) > 0 {
+		return candidateIDs[0]
+	}
+
+	return ""
+}
+
+// validateLevel checks if the design and simulation results meet level requirements
+func validateLevel(lvl *level.Level, design design.Design, metrics map[string]interface{}) (bool, int, []string) {
+	var feedback []string
+	var failedRequirements []string
+	var passedRequirements []string
+
+	// Extract metrics
+	throughput := metrics["throughput"].(int)
+	avgLatency := metrics["latency_avg"].(int)
+	availability := metrics["availability"].(float64)
+	monthlyCost := metrics["cost_monthly"].(int)
+
+	// Check throughput requirement
+	if throughput >= lvl.TargetRPS {
+		passedRequirements = append(passedRequirements, "Throughput requirement met")
+	} else {
+		failedRequirements = append(failedRequirements,
+			fmt.Sprintf("Throughput: %d RPS (required: %d RPS)", throughput, lvl.TargetRPS))
+	}
+
+	// Check latency requirement (using avg latency as approximation for P95)
+	if avgLatency <= lvl.MaxP95LatencyMs {
+		passedRequirements = append(passedRequirements, "Latency requirement met")
+	} else {
+		failedRequirements = append(failedRequirements,
+			fmt.Sprintf("Latency: %dms (max allowed: %dms)", avgLatency, lvl.MaxP95LatencyMs))
+	}
+
+	// Check availability requirement
+	if availability >= lvl.RequiredAvailabilityPct {
+		passedRequirements = append(passedRequirements, "Availability requirement met")
+	} else {
+		failedRequirements = append(failedRequirements,
+			fmt.Sprintf("Availability: %.1f%% (required: %.1f%%)", availability, lvl.RequiredAvailabilityPct))
+	}
+
+	// Check cost requirement
+	if monthlyCost <= lvl.MaxMonthlyUSD {
+		passedRequirements = append(passedRequirements, "Cost requirement met")
+	} else {
+		failedRequirements = append(failedRequirements,
+			fmt.Sprintf("Cost: $%d/month (max allowed: $%d/month)", monthlyCost, lvl.MaxMonthlyUSD))
+	}
+
+	// Check component requirements
+	componentFeedback := validateComponentRequirements(lvl, design)
+	if len(componentFeedback.Failed) > 0 {
+		failedRequirements = append(failedRequirements, componentFeedback.Failed...)
+	}
+	if len(componentFeedback.Passed) > 0 {
+		passedRequirements = append(passedRequirements, componentFeedback.Passed...)
+	}
+
+	// Determine if passed
+	passed := len(failedRequirements) == 0
+
+	// Calculate score (0-100)
+	score := calculateScore(len(passedRequirements), len(failedRequirements), metrics)
+
+	// Build feedback
+	if passed {
+		feedback = append(feedback, "Level completed successfully!")
+		feedback = append(feedback, "")
+		feedback = append(feedback, passedRequirements...)
+	} else {
+		feedback = append(feedback, "Level failed - requirements not met:")
+		feedback = append(feedback, "")
+		feedback = append(feedback, failedRequirements...)
+		if len(passedRequirements) > 0 {
+			feedback = append(feedback, "")
+			feedback = append(feedback, "Requirements passed:")
+			feedback = append(feedback, passedRequirements...)
+		}
+	}
+
+	return passed, score, feedback
+}
+
+type ComponentValidationResult struct {
+	Passed []string
+	Failed []string
+}
+
+// validateComponentRequirements checks mustInclude, mustNotInclude, etc.
+func validateComponentRequirements(lvl *level.Level, design design.Design) ComponentValidationResult {
+	result := ComponentValidationResult{}
+
+	// Build map of component types in design
+	componentTypes := make(map[string]int)
+	for _, node := range design.Nodes {
+		componentTypes[node.Type]++
+	}
+
+	// Check mustInclude requirements
+	for _, required := range lvl.MustInclude {
+		if count, exists := componentTypes[required]; exists && count > 0 {
+			result.Passed = append(result.Passed, fmt.Sprintf("Required component '%s' included", required))
+		} else {
+			result.Failed = append(result.Failed, fmt.Sprintf("Missing required component: '%s'", required))
+		}
+	}
+
+	// Check mustNotInclude requirements
+	for _, forbidden := range lvl.MustNotInclude {
+		if count, exists := componentTypes[forbidden]; exists && count > 0 {
+			result.Failed = append(result.Failed, fmt.Sprintf("Forbidden component used: '%s'", forbidden))
+		}
+	}
+
+	// Check minReplicas requirements
+	for component, minCount := range lvl.MinReplicas {
+		if count, exists := componentTypes[component]; exists && count >= minCount {
+			result.Passed = append(result.Passed, fmt.Sprintf("Sufficient '%s' replicas (%d)", component, count))
+		} else {
+			actualCount := 0
+			if exists {
+				actualCount = count
+			}
+			result.Failed = append(result.Failed,
+				fmt.Sprintf("Insufficient '%s' replicas: %d (minimum: %d)", component, actualCount, minCount))
+		}
+	}
+
+	return result
+}
+
+// calculateScore computes a score from 0-100 based on performance
+func calculateScore(passedCount, failedCount int, metrics map[string]interface{}) int {
+	if failedCount > 0 {
+		// Failed level - score based on how many requirements passed
+		return (passedCount * 100) / (passedCount + failedCount)
+	}
+
+	// Passed level - bonus points for performance
+	baseScore := 70 // Base score for passing
+
+	// Performance bonuses (up to 30 points)
+	performanceBonus := 0
+
+	// Throughput bonus (higher throughput = better)
+	if throughput, ok := metrics["throughput"].(int); ok && throughput > 0 {
+		performanceBonus += min(10, throughput/100) // 1 point per 100 RPS, max 10
+	}
+
+	// Availability bonus (higher availability = better)
+	if availability, ok := metrics["availability"].(float64); ok {
+		if availability >= 99.9 {
+			performanceBonus += 10
+		} else if availability >= 99.5 {
+			performanceBonus += 5
+		}
+	}
+
+	// Cost efficiency bonus (lower cost = better)
+	if cost, ok := metrics["cost_monthly"].(int); ok && cost > 0 {
+		if cost <= 50 {
+			performanceBonus += 10
+		} else if cost <= 100 {
+			performanceBonus += 5
+		}
+	}
+
+	return min(100, baseScore+performanceBonus)
+}
+
+// Helper function
+func min(a, b int) int {
+	if a < b {
+		return a
+	}
+	return b
+}
--- a/router/router.go
+++ b/router/router.go
@ -18,6 +18,7 @@ func SetupRoutes(tmpl *template.Template) *http.ServeMux {
				@@ -18,6 +18,7 @@ func SetupRoutes(tmpl *template.Template) *http.ServeMux {
 	mux.Handle("/simulate", auth.RequireAuth(&handlers.SimulationHandler{}))
 	mux.HandleFunc("/login", auth.LoginHandler)
 	mux.HandleFunc("/callback", auth.CallbackHandler)
+	mux.HandleFunc("/ws", handlers.Messages)

 	return mux
 }
--- a/static/app.js
+++ b/static/app.js
@ -91,9 +91,54 @@ export class CanvasApp {
				@@ -91,9 +91,54 @@ export class CanvasApp {
            node.y = y;
        });

-        this.runButton.addEventListener('click', () => {
+        this.runButton.addEventListener('click', async () => {
            const designData = this.exportDesign();
-            console.log(JSON.stringify(designData))
+            
+            // Try to get level info from URL or page context
+            const levelInfo = this.getLevelInfo();
+            
+            const requestBody = {
+                design: designData,
+                ...levelInfo
+            };
+            
+            console.log('Sending design to simulation:', JSON.stringify(requestBody));
+            
+            // Disable button and show loading state
+            this.runButton.disabled = true;
+            this.runButton.textContent = 'Running Simulation...';
+            
+            try {
+                const response = await fetch('/simulate', {
+                    method: 'POST',
+                    headers: {
+                        'Content-Type': 'application/json',
+                    },
+                    body: JSON.stringify(requestBody)
+                });
+                
+                if (!response.ok) {
+                    throw new Error(`HTTP ${response.status}: ${response.statusText}`);
+                }
+                
+                const result = await response.json();
+                
+                if (result.Success) {
+                    console.log('Simulation successful:', result);
+                    this.showResults(result);
+                } else {
+                    console.error('Simulation failed:', result.Error);
+                    this.showError(result.Error || 'Simulation failed');
+                }
+                
+            } catch (error) {
+                console.error('Network error:', error);
+                this.showError('Failed to run simulation: ' + error.message);
+            } finally {
+                // Re-enable button
+                this.runButton.disabled = false;
+                this.runButton.textContent = 'Test Design';
+            }
        });

        this.canvas.addEventListener('click', () => {
@ -267,4 +312,57 @@ export class CanvasApp {
				@@ -267,4 +312,57 @@ export class CanvasApp {

        return { nodes, connections };
    }
+
+    getLevelInfo() {
+        // Try to extract level info from URL path like /play/url-shortener
+        const pathParts = window.location.pathname.split('/');
+        if (pathParts.length >= 3 && pathParts[1] === 'play') {
+            const levelName = decodeURIComponent(pathParts[2]);
+            return {
+                levelName: levelName,
+                difficulty: 'easy' // Default difficulty, could be enhanced later
+            };
+        }
+        return {};
+    }
+
+    showResults(result) {
+        const metrics = result.Metrics;
+        let message = '';
+        
+        // Level validation results
+        if (result.LevelName) {
+            if (result.Passed) {
+                message += `Level "${result.LevelName}" PASSED!\n`;
+                message += `Score: ${result.Score}/100\n\n`;
+            } else {
+                message += `Level "${result.LevelName}" FAILED\n`;
+                message += `Score: ${result.Score}/100\n\n`;
+            }
+            
+            // Add detailed feedback
+            if (result.Feedback && result.Feedback.length > 0) {
+                message += result.Feedback.join('\n') + '\n\n';
+            }
+        } else {
+            message += `Simulation Complete!\n\n`;
+        }
+        
+        // Performance metrics
+        message += `Performance Metrics:\n`;
+        message += `• Throughput: ${metrics.throughput} req/sec\n`;
+        message += `• Avg Latency: ${metrics.latency_avg}ms\n`;
+        message += `• Availability: ${metrics.availability.toFixed(1)}%\n`;
+        message += `• Monthly Cost: $${metrics.cost_monthly}\n\n`;
+        message += `Timeline: ${result.Timeline.length} ticks simulated`;
+        
+        alert(message);
+        
+        // TODO: Later replace with redirect to results page or modal
+        console.log('Full simulation data:', result);
+    }
+
+    showError(errorMessage) {
+        alert(`Simulation Error:\n\n${errorMessage}\n\nPlease check your design and try again.`);
+    }
 }
--- a/static/plugins/database.js
+++ b/static/plugins/database.js
@ -5,6 +5,8 @@ PluginRegistry.register('database', {
				@@ -5,6 +5,8 @@ PluginRegistry.register('database', {
    label: 'Database',
    props: [
        { name: 'label', type: 'string', default: 'Database', group: 'label-group' },
-        { name: 'replication', type: 'number', default: 1, group: 'db-group' }
+        { name: 'replication', type: 'number', default: 1, group: 'db-group' },
+        { name: 'maxRPS', type: 'number', default: 1000, group: 'db-group' },
+        { name: 'baseLatencyMs', type: 'number', default: 10, group: 'db-group' }
    ]
 });
--- a/static/plugins/messageQueue.js
+++ b/static/plugins/messageQueue.js
@ -6,6 +6,7 @@ PluginRegistry.register('messageQueue', {
				@@ -6,6 +6,7 @@ PluginRegistry.register('messageQueue', {
    props: [
        { name: 'label', type: 'string', default: 'MQ', group: 'label-group' },
        { name: 'queueCapacity', type: 'number', default: 10000, group: 'mq-group' },
-        { name: 'retentionSeconds', type: 'number', default: 600, group: 'mq-group' }
+        { name: 'retentionSeconds', type: 'number', default: 600, group: 'mq-group' },
+        { name: 'processingRate', type: 'number', default: 100, group: 'mq-group' }
    ]
 });
--- a/static/plugins/monitorAlerting.js
+++ b/static/plugins/monitorAlerting.js
@ -6,6 +6,8 @@ PluginRegistry.register('monitoring/alerting', {
				@@ -6,6 +6,8 @@ PluginRegistry.register('monitoring/alerting', {
    props: [
        { name: 'label', type: 'string', default: 'monitor', group: 'label-group' },
        { name: 'tool', type: 'string', default: 'Prometheus', group: 'monitor-group' },
-        { name: 'alertThreshold', type: 'number', default: 80, group: 'monitor-group' }
+        { name: 'alertMetric', type: 'string', default: 'latency', group: 'monitor-group' },
+        { name: 'thresholdValue', type: 'number', default: 80, group: 'monitor-group' },
+        { name: 'thresholdUnit', type: 'string', default: 'ms', group: 'monitor-group' }
    ]
 });