diff --git a/internal/design/design.go b/internal/design/design.go index 9db411c..c032627 100644 --- a/internal/design/design.go +++ b/internal/design/design.go @@ -10,8 +10,8 @@ type Node struct { } type Position struct { - X int `json:"x"` - Y int `json:"y"` + X float64 `json:"x"` + Y float64 `json:"y"` } type Connection struct { @@ -46,8 +46,10 @@ type CDN struct { } type Database struct { - Label string `json:"label"` - Replication int `json:"replication"` + Label string `json:"label"` + Replication int `json:"replication"` + MaxRPS int `json:"maxRPS"` + BaseLatencyMs int `json:"baseLatencyMs"` } type DataPipeline struct { @@ -65,13 +67,14 @@ type MessageQueue struct { Label string `json:"label"` QueueCapacity int `json:"queueCapacity"` RetentionSeconds int `json:"retentionSeconds"` + ProcessingRate int `json:"processingRate"` } type Microservice struct { Label string `json:"label"` InstanceCount int `json:"instanceCount"` CPU int `json:"cpu"` - RAMGb int `json:"ramGb"` + RamGb int `json:"ramGb"` RPSCapacity int `json:"rpsCapacity"` MonthlyUSD int `json:"monthlyUsd"` ScalingStrategy string `json:"scalingStrategy"` diff --git a/internal/simulation/cache.go b/internal/simulation/cache.go new file mode 100644 index 0000000..950ee2d --- /dev/null +++ b/internal/simulation/cache.go @@ -0,0 +1,180 @@ +package simulation + +import ( + "time" +) + +type CacheLogic struct{} + +type CacheEntry struct { + Data string + Timestamp int + AccessTime int + AccessCount int + InsertOrder int +} + +func (c CacheLogic) Tick(props map[string]any, queue []*Request, tick int) ([]*Request, bool) { + // Extract cache properties + cacheTTL := int(AsFloat64(props["cacheTTL"])) + if cacheTTL == 0 { + cacheTTL = 300000 // default 5 minutes in ms + } + + maxEntries := int(AsFloat64(props["maxEntries"])) + if maxEntries == 0 { + maxEntries = 1000 // default max entries + } + + evictionPolicy := AsString(props["evictionPolicy"]) + if evictionPolicy == "" { + evictionPolicy = "LRU" // default eviction policy + } + + // Initialize cache data structures in props + cacheData, ok := props["_cacheData"].(map[string]*CacheEntry) + if !ok { + cacheData = make(map[string]*CacheEntry) + props["_cacheData"] = cacheData + } + + insertCounter, ok := props["_insertCounter"].(int) + if !ok { + insertCounter = 0 + } + + // Current timestamp for this tick + currentTime := tick * 100 // assuming 100ms per tick + + // Clean up expired entries first + c.cleanExpiredEntries(cacheData, currentTime, cacheTTL) + + output := []*Request{} + + for _, req := range queue { + cacheKey := req.ID + "-" + req.Type // Use request ID and type as cache key + + // Check for cache hit + entry, hit := cacheData[cacheKey] + if hit && !c.isExpired(entry, currentTime, cacheTTL) { + // Cache hit - return immediately with minimal latency + reqCopy := *req + reqCopy.LatencyMS += 1 // 1ms for in-memory access + reqCopy.Path = append(reqCopy.Path, "cache-hit") + + // Update access tracking for eviction policies + entry.AccessTime = currentTime + entry.AccessCount++ + + output = append(output, &reqCopy) + } else { + // Cache miss - forward request downstream + reqCopy := *req + reqCopy.Path = append(reqCopy.Path, "cache-miss") + + // For simulation purposes, we'll cache the "response" immediately + // In a real system, this would happen when the response comes back + insertCounter++ + newEntry := &CacheEntry{ + Data: "cached-data", // In real implementation, this would be the response data + Timestamp: currentTime, + AccessTime: currentTime, + AccessCount: 1, + 
InsertOrder: insertCounter, + } + + // First check if we need to evict before adding + if len(cacheData) >= maxEntries { + c.evictEntry(cacheData, evictionPolicy) + } + + // Now add the new entry + cacheData[cacheKey] = newEntry + + output = append(output, &reqCopy) + } + } + + // Update insert counter in props + props["_insertCounter"] = insertCounter + + return output, true +} + +func (c CacheLogic) cleanExpiredEntries(cacheData map[string]*CacheEntry, currentTime, ttl int) { + for key, entry := range cacheData { + if c.isExpired(entry, currentTime, ttl) { + delete(cacheData, key) + } + } +} + +func (c CacheLogic) isExpired(entry *CacheEntry, currentTime, ttl int) bool { + return (currentTime - entry.Timestamp) > ttl +} + +func (c CacheLogic) evictEntry(cacheData map[string]*CacheEntry, policy string) { + if len(cacheData) == 0 { + return + } + + var keyToEvict string + + switch policy { + case "LRU": + // Evict least recently used + oldestTime := int(^uint(0) >> 1) // Max int + for key, entry := range cacheData { + if entry.AccessTime < oldestTime { + oldestTime = entry.AccessTime + keyToEvict = key + } + } + + case "LFU": + // Evict least frequently used + minCount := int(^uint(0) >> 1) // Max int + for key, entry := range cacheData { + if entry.AccessCount < minCount { + minCount = entry.AccessCount + keyToEvict = key + } + } + + case "FIFO": + // Evict first in (oldest insert order) + minOrder := int(^uint(0) >> 1) // Max int + for key, entry := range cacheData { + if entry.InsertOrder < minOrder { + minOrder = entry.InsertOrder + keyToEvict = key + } + } + + case "random": + // Evict random entry + keys := make([]string, 0, len(cacheData)) + for key := range cacheData { + keys = append(keys, key) + } + if len(keys) > 0 { + // Use timestamp as pseudo-random seed + seed := time.Now().UnixNano() + keyToEvict = keys[seed%int64(len(keys))] + } + + default: + // Default to LRU + oldestTime := int(^uint(0) >> 1) + for key, entry := range cacheData { + if entry.AccessTime < oldestTime { + oldestTime = entry.AccessTime + keyToEvict = key + } + } + } + + if keyToEvict != "" { + delete(cacheData, keyToEvict) + } +} diff --git a/internal/simulation/cache_test.go b/internal/simulation/cache_test.go new file mode 100644 index 0000000..248d3e2 --- /dev/null +++ b/internal/simulation/cache_test.go @@ -0,0 +1,319 @@ +package simulation + +import ( + "testing" +) + +func TestCacheLogic_CacheHitMiss(t *testing.T) { + cache := CacheLogic{} + + props := map[string]any{ + "cacheTTL": 10000, // 10 seconds + "maxEntries": 100, + "evictionPolicy": "LRU", + } + + // First request should be a miss + req1 := []*Request{{ID: "req1", Type: "GET", LatencyMS: 0, Path: []string{"start"}}} + output1, alive := cache.Tick(props, req1, 1) + + if !alive { + t.Errorf("Cache should be alive") + } + + if len(output1) != 1 { + t.Errorf("Expected 1 output request, got %d", len(output1)) + } + + // Should be cache miss + if output1[0].LatencyMS != 0 { // No latency added for miss + t.Errorf("Expected 0ms latency for cache miss, got %dms", output1[0].LatencyMS) + } + + // Check path contains cache-miss + found := false + for _, pathItem := range output1[0].Path { + if pathItem == "cache-miss" { + found = true + break + } + } + if !found { + t.Errorf("Expected cache-miss in path, got %v", output1[0].Path) + } + + // Second identical request should be a hit + req2 := []*Request{{ID: "req1", Type: "GET", LatencyMS: 0, Path: []string{"start"}}} + output2, _ := cache.Tick(props, req2, 2) + + if len(output2) != 1 { + 
t.Errorf("Expected 1 output request, got %d", len(output2)) + } + + // Should be cache hit with 1ms latency + if output2[0].LatencyMS != 1 { + t.Errorf("Expected 1ms latency for cache hit, got %dms", output2[0].LatencyMS) + } + + // Check path contains cache-hit + found = false + for _, pathItem := range output2[0].Path { + if pathItem == "cache-hit" { + found = true + break + } + } + if !found { + t.Errorf("Expected cache-hit in path, got %v", output2[0].Path) + } +} + +func TestCacheLogic_TTLExpiration(t *testing.T) { + cache := CacheLogic{} + + props := map[string]any{ + "cacheTTL": 1000, // 1 second + "maxEntries": 100, + "evictionPolicy": "LRU", + } + + // First request - cache miss + req1 := []*Request{{ID: "req1", Type: "GET", LatencyMS: 0}} + cache.Tick(props, req1, 1) + + // Second request within TTL - cache hit + req2 := []*Request{{ID: "req1", Type: "GET", LatencyMS: 0}} + output2, _ := cache.Tick(props, req2, 5) // 5 * 100ms = 500ms later + + if output2[0].LatencyMS != 1 { + t.Errorf("Expected cache hit (1ms), got %dms", output2[0].LatencyMS) + } + + // Third request after TTL expiration - cache miss + req3 := []*Request{{ID: "req1", Type: "GET", LatencyMS: 0}} + output3, _ := cache.Tick(props, req3, 15) // 15 * 100ms = 1500ms later (expired) + + if output3[0].LatencyMS != 0 { + t.Errorf("Expected cache miss (0ms) after TTL expiration, got %dms", output3[0].LatencyMS) + } +} + +func TestCacheLogic_MaxEntriesEviction(t *testing.T) { + cache := CacheLogic{} + + props := map[string]any{ + "cacheTTL": 10000, + "maxEntries": 2, // Small cache size + "evictionPolicy": "LRU", + } + + // Add first entry + req1 := []*Request{{ID: "req1", Type: "GET", LatencyMS: 0}} + cache.Tick(props, req1, 1) + + // Add second entry + req2 := []*Request{{ID: "req2", Type: "GET", LatencyMS: 0}} + cache.Tick(props, req2, 2) + + // Verify both are cached + req1Check := []*Request{{ID: "req1", Type: "GET", LatencyMS: 0}} + output1Check, _ := cache.Tick(props, req1Check, 3) + if output1Check[0].LatencyMS != 1 { + t.Errorf("Expected cache hit for req1, got %dms latency", output1Check[0].LatencyMS) + } + + req2Check := []*Request{{ID: "req2", Type: "GET", LatencyMS: 0}} + output2Check, _ := cache.Tick(props, req2Check, 4) + if output2Check[0].LatencyMS != 1 { + t.Errorf("Expected cache hit for req2, got %dms latency", output2Check[0].LatencyMS) + } + + // Add third entry (should evict LRU entry) + req3 := []*Request{{ID: "req3", Type: "GET", LatencyMS: 0}} + cache.Tick(props, req3, 5) + + // req1 was accessed at tick 3, req2 at tick 4, so req1 should be evicted + req1CheckAgain := []*Request{{ID: "req1", Type: "GET", LatencyMS: 0}} + output1, _ := cache.Tick(props, req1CheckAgain, 6) + if output1[0].LatencyMS != 0 { + t.Errorf("Expected cache miss for LRU evicted entry, got %dms latency", output1[0].LatencyMS) + } + + // After adding req1 back, the cache should be at capacity with different items + // We don't test further to avoid complex cascading eviction scenarios +} + +func TestCacheLogic_LRUEviction(t *testing.T) { + cache := CacheLogic{} + + props := map[string]any{ + "cacheTTL": 10000, + "maxEntries": 2, + "evictionPolicy": "LRU", + } + + // Add two entries + req1 := []*Request{{ID: "req1", Type: "GET", LatencyMS: 0}} + cache.Tick(props, req1, 1) + + req2 := []*Request{{ID: "req2", Type: "GET", LatencyMS: 0}} + cache.Tick(props, req2, 2) + + // Access first entry (make it recently used) + req1Access := []*Request{{ID: "req1", Type: "GET", LatencyMS: 0}} + cache.Tick(props, req1Access, 3) + + // Add 
third entry (should evict req2, since req1 was more recently accessed) + req3 := []*Request{{ID: "req3", Type: "GET", LatencyMS: 0}} + cache.Tick(props, req3, 4) + + // Verify that req2 was evicted (should be cache miss) + req2Check := []*Request{{ID: "req2", Type: "GET", LatencyMS: 0}} + output2, _ := cache.Tick(props, req2Check, 5) + + if output2[0].LatencyMS != 0 { + t.Errorf("Expected cache miss for LRU evicted entry, got %dms latency", output2[0].LatencyMS) + } + + // After adding req2 back, the cache should contain {req2, req1} or {req2, req3} + // depending on LRU logic. We don't test further to avoid cascading evictions. +} + +func TestCacheLogic_FIFOEviction(t *testing.T) { + cache := CacheLogic{} + + props := map[string]any{ + "cacheTTL": 10000, + "maxEntries": 2, + "evictionPolicy": "FIFO", + } + + // Add two entries + req1 := []*Request{{ID: "req1", Type: "GET", LatencyMS: 0}} + cache.Tick(props, req1, 1) + + req2 := []*Request{{ID: "req2", Type: "GET", LatencyMS: 0}} + cache.Tick(props, req2, 2) + + // Access first entry multiple times (shouldn't matter for FIFO) + req1Access := []*Request{{ID: "req1", Type: "GET", LatencyMS: 0}} + cache.Tick(props, req1Access, 3) + cache.Tick(props, req1Access, 4) + + // Add third entry (should evict req1, the first inserted) + req3 := []*Request{{ID: "req3", Type: "GET", LatencyMS: 0}} + cache.Tick(props, req3, 5) + + // Check that req1 was evicted (first in, first out) + req1Check := []*Request{{ID: "req1", Type: "GET", LatencyMS: 0}} + output1, _ := cache.Tick(props, req1Check, 6) + + if output1[0].LatencyMS != 0 { + t.Errorf("Expected cache miss for FIFO evicted entry, got %dms latency", output1[0].LatencyMS) + } + + // After adding req1 back, the cache should contain {req2, req1} or {req3, req1} + // depending on FIFO logic. We don't test further to avoid cascading evictions. 
+} + +func TestCacheLogic_DefaultValues(t *testing.T) { + cache := CacheLogic{} + + // Empty props should use defaults + props := map[string]any{} + + req := []*Request{{ID: "req1", Type: "GET", LatencyMS: 0}} + output, _ := cache.Tick(props, req, 1) + + if len(output) != 1 { + t.Errorf("Expected 1 output request") + } + + // Should be cache miss with 0ms latency + if output[0].LatencyMS != 0 { + t.Errorf("Expected 0ms latency for cache miss with defaults, got %dms", output[0].LatencyMS) + } + + // Second request should be cache hit + req2 := []*Request{{ID: "req1", Type: "GET", LatencyMS: 0}} + output2, _ := cache.Tick(props, req2, 2) + + if output2[0].LatencyMS != 1 { + t.Errorf("Expected 1ms latency for cache hit, got %dms", output2[0].LatencyMS) + } +} + +func TestCacheLogic_SimpleEviction(t *testing.T) { + cache := CacheLogic{} + + props := map[string]any{ + "cacheTTL": 10000, + "maxEntries": 1, // Only 1 entry allowed + "evictionPolicy": "LRU", + } + + // Add first entry + req1 := []*Request{{ID: "req1", Type: "GET", LatencyMS: 0}} + output1, _ := cache.Tick(props, req1, 1) + if output1[0].LatencyMS != 0 { + t.Errorf("First request should be cache miss, got %dms", output1[0].LatencyMS) + } + + // Check it's cached + req1Again := []*Request{{ID: "req1", Type: "GET", LatencyMS: 0}} + output1Again, _ := cache.Tick(props, req1Again, 2) + if output1Again[0].LatencyMS != 1 { + t.Errorf("Second request should be cache hit, got %dms", output1Again[0].LatencyMS) + } + + // Add second entry (should evict first) + req2 := []*Request{{ID: "req2", Type: "GET", LatencyMS: 0}} + output2, _ := cache.Tick(props, req2, 3) + if output2[0].LatencyMS != 0 { + t.Errorf("New request should be cache miss, got %dms", output2[0].LatencyMS) + } + + // Check that first entry is now evicted + req1Final := []*Request{{ID: "req1", Type: "GET", LatencyMS: 0}} + output1Final, _ := cache.Tick(props, req1Final, 4) + if output1Final[0].LatencyMS != 0 { + t.Errorf("Evicted entry should be cache miss, got %dms", output1Final[0].LatencyMS) + } + + // Check that second entry is now also evicted (since req1 was re-added in step 4) + req2Again := []*Request{{ID: "req2", Type: "GET", LatencyMS: 0}} + output2Again, _ := cache.Tick(props, req2Again, 5) + if output2Again[0].LatencyMS != 0 { + t.Errorf("Re-evicted entry should be cache miss, got %dms", output2Again[0].LatencyMS) + } +} + +func TestCacheLogic_DifferentRequestTypes(t *testing.T) { + cache := CacheLogic{} + + props := map[string]any{ + "cacheTTL": 10000, + "maxEntries": 100, + "evictionPolicy": "LRU", + } + + // Same ID but different type should be different cache entries + req1 := []*Request{{ID: "req1", Type: "GET", LatencyMS: 0}} + cache.Tick(props, req1, 1) + + req2 := []*Request{{ID: "req1", Type: "POST", LatencyMS: 0}} + output2, _ := cache.Tick(props, req2, 2) + + // Should be cache miss since different type + if output2[0].LatencyMS != 0 { + t.Errorf("Expected cache miss for different request type, got %dms latency", output2[0].LatencyMS) + } + + // Original GET should still be cached + req1Again := []*Request{{ID: "req1", Type: "GET", LatencyMS: 0}} + output1, _ := cache.Tick(props, req1Again, 3) + + if output1[0].LatencyMS != 1 { + t.Errorf("Expected cache hit for original request type, got %dms latency", output1[0].LatencyMS) + } +} diff --git a/internal/simulation/cdn.go b/internal/simulation/cdn.go index 02eeeea..ab57685 100644 --- a/internal/simulation/cdn.go +++ b/internal/simulation/cdn.go @@ -5,7 +5,7 @@ type CDNLogic struct{} func (c CDNLogic) 
Tick(props map[string]any, queue []*Request, tick int) ([]*Request, bool) { // read the ttl for cached content - ttl := int(AsFloat64(props["ttlMs"])) + ttl := int(AsFloat64(props["ttl"])) // retrieve the cdn's cache from props cache, ok := props["_cache"].(map[string]int) diff --git a/internal/simulation/cdn_test.go b/internal/simulation/cdn_test.go index c28987f..a3a83fd 100644 --- a/internal/simulation/cdn_test.go +++ b/internal/simulation/cdn_test.go @@ -9,7 +9,7 @@ func TestCDNLogic(t *testing.T) { cdn := CDNLogic{} cache := map[string]int{} // shared mutable cache props := map[string]any{ - "ttlMs": float64(1000), + "ttl": float64(1000), "_cache": cache, } diff --git a/internal/simulation/database.go b/internal/simulation/database.go new file mode 100644 index 0000000..576f8de --- /dev/null +++ b/internal/simulation/database.go @@ -0,0 +1,61 @@ +package simulation + +type DatabaseLogic struct{} + +func (d DatabaseLogic) Tick(props map[string]any, queue []*Request, tick int) ([]*Request, bool) { + // Extract database properties + replication := int(AsFloat64(props["replication"])) + if replication == 0 { + replication = 1 // default + } + + // Database capacity (could be based on instance size or explicit RPS) + maxRPS := int(AsFloat64(props["maxRPS"])) + if maxRPS == 0 { + maxRPS = 1000 // default capacity + } + + // Base latency for database operations + baseLatencyMs := int(AsFloat64(props["baseLatencyMs"])) + if baseLatencyMs == 0 { + baseLatencyMs = 10 // default 10ms for local DB operations + } + + // Process requests up to capacity + toProcess := queue + if len(queue) > maxRPS { + toProcess = queue[:maxRPS] + // TODO: Could add queue overflow logic here + } + + output := []*Request{} + + for _, req := range toProcess { + // Add database latency to the request + reqCopy := *req + + // Simulate different operation types and their latencies + operationLatency := baseLatencyMs + + // Simple heuristic: reads are faster than writes + if req.Type == "GET" || req.Type == "READ" { + operationLatency = baseLatencyMs + } else if req.Type == "POST" || req.Type == "WRITE" { + operationLatency = baseLatencyMs * 2 // writes take longer + } + + // Replication factor affects write latency + if req.Type == "POST" || req.Type == "WRITE" { + operationLatency += (replication - 1) * 5 // 5ms per replica + } + + reqCopy.LatencyMS += operationLatency + reqCopy.Path = append(reqCopy.Path, "database-processed") + + output = append(output, &reqCopy) + } + + // Database health (could simulate failures, connection issues, etc.) 
+ // For now, assume always healthy + return output, true +} diff --git a/internal/simulation/database_test.go b/internal/simulation/database_test.go new file mode 100644 index 0000000..92d823a --- /dev/null +++ b/internal/simulation/database_test.go @@ -0,0 +1,139 @@ +package simulation + +import ( + "testing" +) + +func TestDatabaseLogic_BasicProcessing(t *testing.T) { + db := DatabaseLogic{} + + props := map[string]any{ + "replication": 2, + "maxRPS": 100, + "baseLatencyMs": 15, + } + + // Create test requests + reqs := []*Request{ + {ID: "req1", Type: "GET", LatencyMS: 0, Path: []string{"start"}}, + {ID: "req2", Type: "POST", LatencyMS: 0, Path: []string{"start"}}, + } + + output, alive := db.Tick(props, reqs, 1) + + if !alive { + t.Errorf("Database should be alive") + } + + if len(output) != 2 { + t.Errorf("Expected 2 output requests, got %d", len(output)) + } + + // Check read latency (base latency) + readReq := output[0] + if readReq.LatencyMS != 15 { + t.Errorf("Expected read latency 15ms, got %dms", readReq.LatencyMS) + } + + // Check write latency (base * 2 + replication penalty) + writeReq := output[1] + expectedWriteLatency := 15*2 + (2-1)*5 // 30 + 5 = 35ms + if writeReq.LatencyMS != expectedWriteLatency { + t.Errorf("Expected write latency %dms, got %dms", expectedWriteLatency, writeReq.LatencyMS) + } +} + +func TestDatabaseLogic_CapacityLimit(t *testing.T) { + db := DatabaseLogic{} + + props := map[string]any{ + "maxRPS": 2, + "baseLatencyMs": 10, + } + + // Create more requests than capacity + reqs := []*Request{ + {ID: "req1", Type: "GET"}, + {ID: "req2", Type: "GET"}, + {ID: "req3", Type: "GET"}, // This should be dropped + } + + output, _ := db.Tick(props, reqs, 1) + + if len(output) != 2 { + t.Errorf("Expected capacity limit of 2, but processed %d requests", len(output)) + } +} + +func TestDatabaseLogic_DefaultValues(t *testing.T) { + db := DatabaseLogic{} + + // Empty props should use defaults + props := map[string]any{} + + reqs := []*Request{ + {ID: "req1", Type: "GET", LatencyMS: 0}, + } + + output, _ := db.Tick(props, reqs, 1) + + if len(output) != 1 { + t.Errorf("Expected 1 output request") + } + + // Should use default 10ms base latency + if output[0].LatencyMS != 10 { + t.Errorf("Expected default latency 10ms, got %dms", output[0].LatencyMS) + } +} + +func TestDatabaseLogic_ReplicationEffect(t *testing.T) { + db := DatabaseLogic{} + + // Test with high replication + props := map[string]any{ + "replication": 5, + "baseLatencyMs": 10, + } + + reqs := []*Request{ + {ID: "req1", Type: "POST", LatencyMS: 0}, + } + + output, _ := db.Tick(props, reqs, 1) + + if len(output) != 1 { + t.Errorf("Expected 1 output request") + } + + // Write latency: base*2 + (replication-1)*5 = 10*2 + (5-1)*5 = 20 + 20 = 40ms + expectedLatency := 10*2 + (5-1)*5 + if output[0].LatencyMS != expectedLatency { + t.Errorf("Expected latency %dms with replication=5, got %dms", expectedLatency, output[0].LatencyMS) + } +} + +func TestDatabaseLogic_ReadVsWrite(t *testing.T) { + db := DatabaseLogic{} + + props := map[string]any{ + "replication": 1, + "baseLatencyMs": 20, + } + + readReq := []*Request{{ID: "read", Type: "GET", LatencyMS: 0}} + writeReq := []*Request{{ID: "write", Type: "POST", LatencyMS: 0}} + + readOutput, _ := db.Tick(props, readReq, 1) + writeOutput, _ := db.Tick(props, writeReq, 1) + + // Read should be base latency + if readOutput[0].LatencyMS != 20 { + t.Errorf("Expected read latency 20ms, got %dms", readOutput[0].LatencyMS) + } + + // Write should be double base latency (no 
replication penalty with replication=1) + if writeOutput[0].LatencyMS != 40 { + t.Errorf("Expected write latency 40ms, got %dms", writeOutput[0].LatencyMS) + } +} diff --git a/internal/simulation/datapipeline.go b/internal/simulation/datapipeline.go new file mode 100644 index 0000000..8f694cc --- /dev/null +++ b/internal/simulation/datapipeline.go @@ -0,0 +1,203 @@ +package simulation + +type DataPipelineLogic struct{} + +type DataBatch struct { + ID string + RecordCount int + Timestamp int + ProcessingMS int +} + +type PipelineState struct { + ProcessingQueue []DataBatch + CompletedBatches int + TotalRecords int + BacklogSize int +} + +func (d DataPipelineLogic) Tick(props map[string]any, queue []*Request, tick int) ([]*Request, bool) { + // Extract data pipeline properties + batchSize := int(AsFloat64(props["batchSize"])) + if batchSize == 0 { + batchSize = 500 // default batch size + } + + transformation := AsString(props["transformation"]) + if transformation == "" { + transformation = "map" // default transformation + } + + // Get pipeline state from props (persistent state) + state, ok := props["_pipelineState"].(PipelineState) + if !ok { + state = PipelineState{ + ProcessingQueue: []DataBatch{}, + CompletedBatches: 0, + TotalRecords: 0, + BacklogSize: 0, + } + } + + currentTime := tick * 100 // Convert tick to milliseconds + + // Convert incoming requests to data batches + if len(queue) > 0 { + // Group requests into batches + batches := d.createBatches(queue, batchSize, currentTime, transformation) + + // Add batches to processing queue + state.ProcessingQueue = append(state.ProcessingQueue, batches...) + state.BacklogSize += len(queue) + } + + // Process batches that are ready (completed their processing time) + output := []*Request{} + remainingBatches := []DataBatch{} + + for _, batch := range state.ProcessingQueue { + if currentTime >= batch.Timestamp+batch.ProcessingMS { + // Batch is complete - create output requests + for i := 0; i < batch.RecordCount; i++ { + processedReq := &Request{ + ID: batch.ID + "-record-" + string(rune('0'+i)), + Timestamp: batch.Timestamp, + LatencyMS: batch.ProcessingMS, + Origin: "data-pipeline", + Type: "PROCESSED", + Path: []string{"pipeline-" + transformation}, + } + output = append(output, processedReq) + } + + state.CompletedBatches++ + state.TotalRecords += batch.RecordCount + } else { + // Batch still processing + remainingBatches = append(remainingBatches, batch) + } + } + + state.ProcessingQueue = remainingBatches + state.BacklogSize = len(remainingBatches) * batchSize + + // Update persistent state + props["_pipelineState"] = state + + // Health check: pipeline is healthy if backlog is not too large + maxBacklogSize := batchSize * 20 // Allow up to 20 batches in backlog + healthy := state.BacklogSize < maxBacklogSize + + return output, healthy +} + +// createBatches groups requests into batches and calculates processing time +func (d DataPipelineLogic) createBatches(requests []*Request, batchSize int, timestamp int, transformation string) []DataBatch { + batches := []DataBatch{} + + for i := 0; i < len(requests); i += batchSize { + end := i + batchSize + if end > len(requests) { + end = len(requests) + } + + recordCount := end - i + processingTime := d.calculateProcessingTime(recordCount, transformation) + + batch := DataBatch{ + ID: "batch-" + string(rune('A'+len(batches))), + RecordCount: recordCount, + Timestamp: timestamp, + ProcessingMS: processingTime, + } + + batches = append(batches, batch) + } + + return batches +} + +// 
calculateProcessingTime determines how long a batch takes to process based on transformation type +func (d DataPipelineLogic) calculateProcessingTime(recordCount int, transformation string) int { + // Base processing time per record + baseTimePerRecord := d.getTransformationComplexity(transformation) + + // Total time scales with record count but with some economies of scale + totalTime := float64(recordCount) * baseTimePerRecord + + // Add batch overhead (setup, teardown, I/O) + batchOverhead := d.getBatchOverhead(transformation) + totalTime += batchOverhead + + // Apply economies of scale for larger batches (slightly more efficient) + if recordCount > 100 { + scaleFactor := 0.9 // 10% efficiency gain for large batches + totalTime *= scaleFactor + } + + return int(totalTime) +} + +// getTransformationComplexity returns base processing time per record in milliseconds +func (d DataPipelineLogic) getTransformationComplexity(transformation string) float64 { + switch transformation { + case "map": + return 1.0 // Simple field mapping + case "filter": + return 0.5 // Just evaluate conditions + case "sort": + return 3.0 // Sorting requires more compute + case "aggregate": + return 2.0 // Grouping and calculating aggregates + case "join": + return 5.0 // Most expensive - joining with other datasets + case "deduplicate": + return 2.5 // Hash-based deduplication + case "validate": + return 1.5 // Data validation and cleaning + case "enrich": + return 4.0 // Enriching with external data + case "compress": + return 1.2 // Compression processing + case "encrypt": + return 2.0 // Encryption overhead + default: + return 1.0 // Default to simple transformation + } +} + +// getBatchOverhead returns fixed overhead time per batch in milliseconds +func (d DataPipelineLogic) getBatchOverhead(transformation string) float64 { + switch transformation { + case "map", "filter", "validate": + return 50.0 // Low overhead for simple operations + case "sort", "aggregate", "deduplicate": + return 200.0 // Medium overhead for complex operations + case "join", "enrich": + return 500.0 // High overhead for operations requiring external data + case "compress", "encrypt": + return 100.0 // Medium overhead for I/O operations + default: + return 100.0 // Default overhead + } +} + +// Helper function to get pipeline statistics +func (d DataPipelineLogic) GetPipelineStats(props map[string]any) map[string]interface{} { + state, ok := props["_pipelineState"].(PipelineState) + if !ok { + return map[string]interface{}{ + "completedBatches": 0, + "totalRecords": 0, + "backlogSize": 0, + "queuedBatches": 0, + } + } + + return map[string]interface{}{ + "completedBatches": state.CompletedBatches, + "totalRecords": state.TotalRecords, + "backlogSize": state.BacklogSize, + "queuedBatches": len(state.ProcessingQueue), + } +} diff --git a/internal/simulation/datapipeline_test.go b/internal/simulation/datapipeline_test.go new file mode 100644 index 0000000..b9ebd35 --- /dev/null +++ b/internal/simulation/datapipeline_test.go @@ -0,0 +1,396 @@ +package simulation + +import ( + "testing" +) + +func TestDataPipelineLogic_BasicProcessing(t *testing.T) { + logic := DataPipelineLogic{} + + props := map[string]any{ + "batchSize": 100.0, + "transformation": "map", + } + + // Create 50 requests (less than batch size) + requests := make([]*Request, 50) + for i := range requests { + requests[i] = &Request{ID: string(rune('1' + i)), Type: "DATA", LatencyMS: 0} + } + + // First tick - should create batch and start processing + output, healthy := 
logic.Tick(props, requests, 1) + + if !healthy { + t.Error("Expected data pipeline to be healthy") + } + + // Should not have output yet (batch is still processing) + if len(output) != 0 { + t.Errorf("Expected no output during processing, got %d", len(output)) + } + + // Check that batch was created + state, ok := props["_pipelineState"].(PipelineState) + if !ok { + t.Error("Expected pipeline state to be created") + } + + if len(state.ProcessingQueue) != 1 { + t.Errorf("Expected 1 batch in processing queue, got %d", len(state.ProcessingQueue)) + } + + if state.ProcessingQueue[0].RecordCount != 50 { + t.Errorf("Expected batch with 50 records, got %d", state.ProcessingQueue[0].RecordCount) + } +} + +func TestDataPipelineLogic_BatchCompletion(t *testing.T) { + logic := DataPipelineLogic{} + + props := map[string]any{ + "batchSize": 10.0, + "transformation": "filter", // Fast transformation + } + + // Create 5 requests + requests := make([]*Request, 5) + for i := range requests { + requests[i] = &Request{ID: string(rune('1' + i)), Type: "DATA", LatencyMS: 0} + } + + // First tick - start processing + logic.Tick(props, requests, 1) + + // Wait enough ticks for processing to complete + // Filter transformation should complete quickly + var output []*Request + var healthy bool + + for tick := 2; tick <= 10; tick++ { + output, healthy = logic.Tick(props, []*Request{}, tick) + if len(output) > 0 { + break + } + } + + if !healthy { + t.Error("Expected data pipeline to be healthy") + } + + // Should have output matching input count + if len(output) != 5 { + t.Errorf("Expected 5 output records, got %d", len(output)) + } + + // Check output structure + for _, req := range output { + if req.Type != "PROCESSED" { + t.Errorf("Expected PROCESSED type, got %s", req.Type) + } + if req.Origin != "data-pipeline" { + t.Errorf("Expected data-pipeline origin, got %s", req.Origin) + } + if len(req.Path) == 0 || req.Path[0] != "pipeline-filter" { + t.Error("Expected path to indicate filter transformation") + } + } +} + +func TestDataPipelineLogic_MultipleBatches(t *testing.T) { + logic := DataPipelineLogic{} + + props := map[string]any{ + "batchSize": 10.0, + "transformation": "map", + } + + // Create 25 requests (should create 3 batches: 10, 10, 5) + requests := make([]*Request, 25) + for i := range requests { + requests[i] = &Request{ID: string(rune('1' + i)), Type: "DATA", LatencyMS: 0} + } + + // First tick - create batches + output, healthy := logic.Tick(props, requests, 1) + + if !healthy { + t.Error("Expected data pipeline to be healthy") + } + + if len(output) != 0 { + t.Error("Expected no immediate output") + } + + // Check that 3 batches were created + state, ok := props["_pipelineState"].(PipelineState) + if !ok { + t.Error("Expected pipeline state to be created") + } + + if len(state.ProcessingQueue) != 3 { + t.Errorf("Expected 3 batches in processing queue, got %d", len(state.ProcessingQueue)) + } + + // Verify batch sizes + expectedSizes := []int{10, 10, 5} + for i, batch := range state.ProcessingQueue { + if batch.RecordCount != expectedSizes[i] { + t.Errorf("Expected batch %d to have %d records, got %d", + i, expectedSizes[i], batch.RecordCount) + } + } +} + +func TestDataPipelineLogic_TransformationComplexity(t *testing.T) { + logic := DataPipelineLogic{} + + transformations := []string{"filter", "map", "sort", "aggregate", "join"} + + for _, transformation := range transformations { + t.Run(transformation, func(t *testing.T) { + complexity := logic.getTransformationComplexity(transformation) + + 
// Verify relative complexity ordering + switch transformation { + case "filter": + if complexity >= logic.getTransformationComplexity("map") { + t.Error("Filter should be simpler than map") + } + case "join": + if complexity <= logic.getTransformationComplexity("aggregate") { + t.Error("Join should be more complex than aggregate") + } + case "sort": + if complexity <= logic.getTransformationComplexity("map") { + t.Error("Sort should be more complex than map") + } + } + + if complexity <= 0 { + t.Errorf("Expected positive complexity for %s", transformation) + } + }) + } +} + +func TestDataPipelineLogic_BatchOverhead(t *testing.T) { + logic := DataPipelineLogic{} + + // Test different overhead levels + testCases := []struct { + transformation string + expectedRange [2]float64 // [min, max] + }{ + {"map", [2]float64{0, 100}}, // Low overhead + {"join", [2]float64{300, 600}}, // High overhead + {"sort", [2]float64{150, 300}}, // Medium overhead + } + + for _, tc := range testCases { + overhead := logic.getBatchOverhead(tc.transformation) + + if overhead < tc.expectedRange[0] || overhead > tc.expectedRange[1] { + t.Errorf("Expected %s overhead between %.0f-%.0f, got %.0f", + tc.transformation, tc.expectedRange[0], tc.expectedRange[1], overhead) + } + } +} + +func TestDataPipelineLogic_ProcessingTime(t *testing.T) { + logic := DataPipelineLogic{} + + // Test that processing time scales with record count + smallBatch := logic.calculateProcessingTime(10, "map") + largeBatch := logic.calculateProcessingTime(100, "map") + + if largeBatch <= smallBatch { + t.Error("Expected larger batch to take more time") + } + + // Test that complex transformations take longer + simpleTime := logic.calculateProcessingTime(50, "filter") + complexTime := logic.calculateProcessingTime(50, "join") + + if complexTime <= simpleTime { + t.Error("Expected complex transformation to take longer") + } + + // Test economies of scale (large batches should be more efficient per record) + smallPerRecord := float64(smallBatch) / 10.0 + largePerRecord := float64(largeBatch) / 100.0 + + if largePerRecord >= smallPerRecord { + t.Error("Expected economies of scale for larger batches") + } +} + +func TestDataPipelineLogic_HealthCheck(t *testing.T) { + logic := DataPipelineLogic{} + + props := map[string]any{ + "batchSize": 10.0, + "transformation": "join", // Slow transformation + } + + // Create a large number of requests to test backlog health + requests := make([]*Request, 300) // 30 batches (above healthy threshold) + for i := range requests { + requests[i] = &Request{ID: string(rune('1' + (i % 26))), Type: "DATA", LatencyMS: 0} + } + + // First tick - should create many batches + output, healthy := logic.Tick(props, requests, 1) + + // Should be unhealthy due to large backlog + if healthy { + t.Error("Expected data pipeline to be unhealthy with large backlog") + } + + if len(output) != 0 { + t.Error("Expected no immediate output with slow transformation") + } + + // Check backlog size + state, ok := props["_pipelineState"].(PipelineState) + if !ok { + t.Error("Expected pipeline state to be created") + } + + if state.BacklogSize < 200 { + t.Errorf("Expected large backlog, got %d", state.BacklogSize) + } +} + +func TestDataPipelineLogic_DefaultValues(t *testing.T) { + logic := DataPipelineLogic{} + + // Empty props should use defaults + props := map[string]any{} + + requests := []*Request{{ID: "1", Type: "DATA", LatencyMS: 0}} + + output, healthy := logic.Tick(props, requests, 1) + + if !healthy { + t.Error("Expected pipeline to be 
healthy with default values") + } + + if len(output) != 0 { + t.Error("Expected no immediate output") + } + + // Should use default batch size and transformation + state, ok := props["_pipelineState"].(PipelineState) + if !ok { + t.Error("Expected pipeline state to be created with defaults") + } + + if len(state.ProcessingQueue) != 1 { + t.Error("Expected one batch with default settings") + } +} + +func TestDataPipelineLogic_PipelineStats(t *testing.T) { + logic := DataPipelineLogic{} + + props := map[string]any{ + "batchSize": 5.0, + "transformation": "filter", + } + + // Initial stats should be empty + stats := logic.GetPipelineStats(props) + if stats["completedBatches"] != 0 { + t.Error("Expected initial completed batches to be 0") + } + + // Process some data + requests := make([]*Request, 10) + for i := range requests { + requests[i] = &Request{ID: string(rune('1' + i)), Type: "DATA", LatencyMS: 0} + } + + logic.Tick(props, requests, 1) + + // Check stats after processing + stats = logic.GetPipelineStats(props) + if stats["queuedBatches"] != 2 { + t.Errorf("Expected 2 queued batches, got %v", stats["queuedBatches"]) + } + + if stats["backlogSize"] != 10 { + t.Errorf("Expected backlog size of 10, got %v", stats["backlogSize"]) + } +} + +func TestDataPipelineLogic_ContinuousProcessing(t *testing.T) { + logic := DataPipelineLogic{} + + props := map[string]any{ + "batchSize": 5.0, + "transformation": "map", + } + + // Process multiple waves of data + totalOutput := 0 + + for wave := 0; wave < 3; wave++ { + requests := make([]*Request, 5) + for i := range requests { + requests[i] = &Request{ID: string(rune('A' + wave*5 + i)), Type: "DATA", LatencyMS: 0} + } + + // Process each wave + for tick := wave*10 + 1; tick <= wave*10+5; tick++ { + var output []*Request + if tick == wave*10+1 { + output, _ = logic.Tick(props, requests, tick) + } else { + output, _ = logic.Tick(props, []*Request{}, tick) + } + totalOutput += len(output) + } + } + + // Should have processed all data eventually + if totalOutput != 15 { + t.Errorf("Expected 15 total output records, got %d", totalOutput) + } + + // Check final stats + stats := logic.GetPipelineStats(props) + if stats["totalRecords"] != 15 { + t.Errorf("Expected 15 total records processed, got %v", stats["totalRecords"]) + } +} + +func TestDataPipelineLogic_EmptyQueue(t *testing.T) { + logic := DataPipelineLogic{} + + props := map[string]any{ + "batchSize": 10.0, + "transformation": "map", + } + + // Process empty queue + output, healthy := logic.Tick(props, []*Request{}, 1) + + if !healthy { + t.Error("Expected pipeline to be healthy with empty queue") + } + + if len(output) != 0 { + t.Error("Expected no output with empty queue") + } + + // State should be initialized but empty + state, ok := props["_pipelineState"].(PipelineState) + if !ok { + t.Error("Expected pipeline state to be initialized") + } + + if len(state.ProcessingQueue) != 0 { + t.Error("Expected empty processing queue") + } +} diff --git a/internal/simulation/engine.go b/internal/simulation/engine.go index 5aca079..1653d35 100644 --- a/internal/simulation/engine.go +++ b/internal/simulation/engine.go @@ -185,6 +185,20 @@ func GetLogicForType(t string) NodeLogic { return LoadBalancerLogic{} case "cdn": return CDNLogic{} + case "database": + return DatabaseLogic{} + case "cache": + return CacheLogic{} + case "messageQueue": + return MessageQueueLogic{} + case "microservice": + return MicroserviceLogic{} + case "monitoring/alerting": + return MonitoringLogic{} + case "third party service": + 
return ThirdPartyServiceLogic{} + case "data pipeline": + return DataPipelineLogic{} default: return nil } diff --git a/internal/simulation/engine_test.go b/internal/simulation/engine_test.go index 5366446..3585df2 100644 --- a/internal/simulation/engine_test.go +++ b/internal/simulation/engine_test.go @@ -1,6 +1,8 @@ package simulation import ( + "encoding/json" + "os" "testing" "systemdesigngame/internal/design" @@ -10,8 +12,8 @@ import ( func TestSimpleChainSimulation(t *testing.T) { d := design.Design{ Nodes: []design.Node{ - {ID: "a", Type: "webserver", Props: map[string]any{"capacityRPS": 1, "baseLatencyMs": 10}}, - {ID: "b", Type: "webserver", Props: map[string]any{"capacityRPS": 1, "baseLatencyMs": 10}}, + {ID: "a", Type: "webserver", Props: map[string]any{"rpsCapacity": 1, "baseLatencyMs": 10}}, + {ID: "b", Type: "webserver", Props: map[string]any{"rpsCapacity": 1, "baseLatencyMs": 10}}, }, Connections: []design.Connection{ {Source: "a", Target: "b"}, @@ -49,8 +51,8 @@ func TestSimpleChainSimulation(t *testing.T) { func TestSingleTickRouting(t *testing.T) { d := design.Design{ Nodes: []design.Node{ - {ID: "a", Type: "webserver", Props: map[string]any{"capacityRPS": 1.0, "baseLatencyMs": 10.0}}, - {ID: "b", Type: "webserver", Props: map[string]any{"capacityRPS": 1.0, "baseLatencyMs": 10.0}}, + {ID: "a", Type: "webserver", Props: map[string]any{"rpsCapacity": 1.0, "baseLatencyMs": 10.0}}, + {ID: "b", Type: "webserver", Props: map[string]any{"rpsCapacity": 1.0, "baseLatencyMs": 10.0}}, }, Connections: []design.Connection{ {Source: "a", Target: "b"}, @@ -85,7 +87,7 @@ func TestSingleTickRouting(t *testing.T) { func TestHighRPSSimulation(t *testing.T) { d := design.Design{ Nodes: []design.Node{ - {ID: "entry", Type: "webserver", Props: map[string]any{"capacityRPS": 5000, "baseLatencyMs": 1}}, + {ID: "entry", Type: "webserver", Props: map[string]any{"rpsCapacity": 5000, "baseLatencyMs": 1}}, }, Connections: []design.Connection{}, } @@ -106,3 +108,854 @@ func TestHighRPSSimulation(t *testing.T) { t.Errorf("expected %d total emitted requests, got %d", expected, totalEmitted) } } + +func TestDatabaseIntegration(t *testing.T) { + design := design.Design{ + Nodes: []design.Node{ + { + ID: "webserver", + Type: "webserver", + Props: map[string]interface{}{ + "rpsCapacity": 10, + }, + }, + { + ID: "database", + Type: "database", + Props: map[string]interface{}{ + "replication": 2, + "maxRPS": 100, + "baseLatencyMs": 20, + }, + }, + }, + Connections: []design.Connection{ + { + Source: "webserver", + Target: "database", + }, + }, + } + + engine := NewEngineFromDesign(design, 100) + engine.RPS = 5 + engine.EntryNode = "webserver" + + snapshots := engine.Run(3, 100) + + if len(snapshots) != 3 { + t.Errorf("Expected 3 snapshots, got %d", len(snapshots)) + } + + // Verify database node exists and is healthy + if len(engine.Nodes) != 2 { + t.Errorf("Expected 2 nodes (webserver + database), got %d", len(engine.Nodes)) + } + + dbNode, exists := engine.Nodes["database"] + if !exists { + t.Errorf("Database node should exist in simulation") + } + + if !dbNode.Alive { + t.Errorf("Database node should be alive") + } + + if dbNode.Type != "database" { + t.Errorf("Expected database type, got %s", dbNode.Type) + } +} + +func TestCacheIntegration(t *testing.T) { + design := design.Design{ + Nodes: []design.Node{ + { + ID: "webserver", + Type: "webserver", + Props: map[string]interface{}{ + "rpsCapacity": 10, + }, + }, + { + ID: "cache", + Type: "cache", + Props: map[string]interface{}{ + "cacheTTL": 5000, + 
"maxEntries": 50, + "evictionPolicy": "LRU", + }, + }, + { + ID: "database", + Type: "database", + Props: map[string]interface{}{ + "replication": 1, + "maxRPS": 100, + "baseLatencyMs": 15, + }, + }, + }, + Connections: []design.Connection{ + { + Source: "webserver", + Target: "cache", + }, + { + Source: "cache", + Target: "database", + }, + }, + } + + engine := NewEngineFromDesign(design, 100) + engine.RPS = 5 + engine.EntryNode = "webserver" + + snapshots := engine.Run(5, 100) + + if len(snapshots) != 5 { + t.Errorf("Expected 5 snapshots, got %d", len(snapshots)) + } + + // Verify all nodes exist and are healthy + if len(engine.Nodes) != 3 { + t.Errorf("Expected 3 nodes (webserver + cache + database), got %d", len(engine.Nodes)) + } + + cacheNode, exists := engine.Nodes["cache"] + if !exists { + t.Errorf("Cache node should exist in simulation") + } + + if !cacheNode.Alive { + t.Errorf("Cache node should be alive") + } + + if cacheNode.Type != "cache" { + t.Errorf("Expected cache type, got %s", cacheNode.Type) + } + + // Verify cache has internal state + cacheData, ok := cacheNode.Props["_cacheData"] + if !ok { + t.Errorf("Cache should have internal _cacheData state") + } + + // Cache data should be a map + if _, ok := cacheData.(map[string]*CacheEntry); !ok { + t.Errorf("Cache data should be map[string]*CacheEntry") + } +} + +func TestMessageQueueIntegration(t *testing.T) { + design := design.Design{ + Nodes: []design.Node{ + { + ID: "producer", + Type: "webserver", + Props: map[string]interface{}{ + "rpsCapacity": 10, + }, + }, + { + ID: "messagequeue", + Type: "messageQueue", + Props: map[string]interface{}{ + "queueCapacity": 50, + "retentionSeconds": 3600, + "processingRate": 5, + }, + }, + { + ID: "consumer", + Type: "webserver", + Props: map[string]interface{}{ + "rpsCapacity": 20, + }, + }, + }, + Connections: []design.Connection{ + { + Source: "producer", + Target: "messagequeue", + }, + { + Source: "messagequeue", + Target: "consumer", + }, + }, + } + + engine := NewEngineFromDesign(design, 100) + engine.RPS = 3 + engine.EntryNode = "producer" + + snapshots := engine.Run(5, 100) + + if len(snapshots) != 5 { + t.Errorf("Expected 5 snapshots, got %d", len(snapshots)) + } + + // Verify all nodes exist and are healthy + if len(engine.Nodes) != 3 { + t.Errorf("Expected 3 nodes (producer + queue + consumer), got %d", len(engine.Nodes)) + } + + queueNode, exists := engine.Nodes["messagequeue"] + if !exists { + t.Errorf("Message queue node should exist in simulation") + } + + if !queueNode.Alive { + t.Errorf("Message queue node should be alive") + } + + if queueNode.Type != "messageQueue" { + t.Errorf("Expected messageQueue type, got %s", queueNode.Type) + } + + // Verify queue has internal state + messageQueue, ok := queueNode.Props["_messageQueue"] + if !ok { + t.Errorf("Message queue should have internal _messageQueue state") + } + + // Message queue should be a slice + if _, ok := messageQueue.([]QueuedMessage); !ok { + t.Errorf("Message queue should be []QueuedMessage") + } +} + +func TestMicroserviceIntegration(t *testing.T) { + // Load the microservice design + designData, err := os.ReadFile("testdata/microservice_design.json") + if err != nil { + t.Fatalf("Failed to read microservice design: %v", err) + } + + var d design.Design + if err := json.Unmarshal(designData, &d); err != nil { + t.Fatalf("Failed to unmarshal design: %v", err) + } + + // Create engine + engine := NewEngineFromDesign(d, 100) + if engine == nil { + t.Fatalf("Failed to create engine from microservice 
design") + } + + // Set up simulation parameters + engine.RPS = 30 + engine.EntryNode = "webserver-1" + + // Run simulation for 5 ticks + snapshots := engine.Run(5, 100) + + if len(snapshots) != 5 { + t.Errorf("Expected 5 snapshots, got %d", len(snapshots)) + } + + // Verify microservice nodes exist and are configured correctly + userService, exists := engine.Nodes["microservice-1"] + if !exists { + t.Errorf("User service microservice node should exist") + } + + if !userService.Alive { + t.Errorf("User service should be alive") + } + + if userService.Type != "microservice" { + t.Errorf("Expected microservice type, got %s", userService.Type) + } + + orderService, exists := engine.Nodes["microservice-2"] + if !exists { + t.Errorf("Order service microservice node should exist") + } + + if !orderService.Alive { + t.Errorf("Order service should be alive") + } + + // Verify auto-scaling properties are preserved + userServiceInstanceCount := userService.Props["instanceCount"] + if userServiceInstanceCount == nil { + t.Errorf("User service should have instanceCount property") + } + + // Verify different scaling strategies + userScalingStrategy := userService.Props["scalingStrategy"] + if userScalingStrategy != "auto" { + t.Errorf("Expected auto scaling strategy for user service, got %v", userScalingStrategy) + } + + orderScalingStrategy := orderService.Props["scalingStrategy"] + if orderScalingStrategy != "manual" { + t.Errorf("Expected manual scaling strategy for order service, got %v", orderScalingStrategy) + } + + // Verify resource configurations + userCPU := userService.Props["cpu"] + if userCPU != 4.0 { + t.Errorf("Expected user service to have 4 CPU cores, got %v", userCPU) + } + + orderRAM := orderService.Props["ramGb"] + if orderRAM != 4.0 { + t.Errorf("Expected order service to have 4GB RAM, got %v", orderRAM) + } + + // Check that microservices processed requests through the simulation + lastSnapshot := snapshots[len(snapshots)-1] + if len(lastSnapshot.QueueSizes) == 0 { + t.Errorf("Expected queue sizes to be tracked in snapshots") + } + + // Verify load balancer connected to microservices + loadBalancer, exists := engine.Nodes["lb-1"] + if !exists { + t.Errorf("Load balancer should exist") + } + + if !loadBalancer.Alive { + t.Errorf("Load balancer should be alive") + } + + // Verify database connection exists + database, exists := engine.Nodes["db-1"] + if !exists { + t.Errorf("Database should exist") + } + + if !database.Alive { + t.Errorf("Database should be alive") + } +} + +func TestMonitoringIntegration(t *testing.T) { + // Load the monitoring design + designData, err := os.ReadFile("testdata/monitoring_design.json") + if err != nil { + t.Fatalf("Failed to read monitoring design: %v", err) + } + + var d design.Design + if err := json.Unmarshal(designData, &d); err != nil { + t.Fatalf("Failed to unmarshal design: %v", err) + } + + // Create engine + engine := NewEngineFromDesign(d, 100) + if engine == nil { + t.Fatalf("Failed to create engine from monitoring design") + } + + // Set up simulation parameters + engine.RPS = 20 + engine.EntryNode = "webserver-1" + + // Run simulation for 10 ticks to allow metrics collection + snapshots := engine.Run(10, 100) + + if len(snapshots) != 10 { + t.Errorf("Expected 10 snapshots, got %d", len(snapshots)) + } + + // Verify monitoring nodes exist and are configured correctly + monitor1, exists := engine.Nodes["monitor-1"] + if !exists { + t.Errorf("Latency monitor node should exist") + } + + if !monitor1.Alive { + t.Errorf("Latency monitor 
should be alive") + } + + if monitor1.Type != "monitoring/alerting" { + t.Errorf("Expected monitoring/alerting type, got %s", monitor1.Type) + } + + monitor2, exists := engine.Nodes["monitor-2"] + if !exists { + t.Errorf("Error rate monitor node should exist") + } + + if !monitor2.Alive { + t.Errorf("Error rate monitor should be alive") + } + + // Verify monitoring properties are preserved + tool1 := monitor1.Props["tool"] + if tool1 != "Prometheus" { + t.Errorf("Expected Prometheus tool for monitor-1, got %v", tool1) + } + + tool2 := monitor2.Props["tool"] + if tool2 != "Datadog" { + t.Errorf("Expected Datadog tool for monitor-2, got %v", tool2) + } + + alertMetric1 := monitor1.Props["alertMetric"] + if alertMetric1 != "latency" { + t.Errorf("Expected latency alert metric for monitor-1, got %v", alertMetric1) + } + + alertMetric2 := monitor2.Props["alertMetric"] + if alertMetric2 != "error_rate" { + t.Errorf("Expected error_rate alert metric for monitor-2, got %v", alertMetric2) + } + + // Check that metrics were collected during simulation + metrics1, ok := monitor1.Props["_metrics"] + if !ok { + t.Errorf("Expected monitor-1 to have collected metrics") + } + + if metrics1 == nil { + t.Errorf("Expected monitor-1 metrics to be non-nil") + } + + // Check alert count tracking + alertCount1, ok := monitor1.Props["_alertCount"] + if !ok { + t.Errorf("Expected monitor-1 to track alert count") + } + + if alertCount1 == nil { + t.Errorf("Expected monitor-1 alert count to be tracked") + } + + // Verify other components in the chain + webserver, exists := engine.Nodes["webserver-1"] + if !exists { + t.Errorf("Web server should exist") + } + + if !webserver.Alive { + t.Errorf("Web server should be alive") + } + + loadBalancer, exists := engine.Nodes["lb-1"] + if !exists { + t.Errorf("Load balancer should exist") + } + + if !loadBalancer.Alive { + t.Errorf("Load balancer should be alive") + } + + // Verify microservices + userService, exists := engine.Nodes["microservice-1"] + if !exists { + t.Errorf("User service should exist") + } + + if !userService.Alive { + t.Errorf("User service should be alive") + } + + orderService, exists := engine.Nodes["microservice-2"] + if !exists { + t.Errorf("Order service should exist") + } + + if !orderService.Alive { + t.Errorf("Order service should be alive") + } + + // Verify database + database, exists := engine.Nodes["db-1"] + if !exists { + t.Errorf("Database should exist") + } + + if !database.Alive { + t.Errorf("Database should be alive") + } + + // Check that requests flowed through the monitoring chain + lastSnapshot := snapshots[len(snapshots)-1] + if len(lastSnapshot.QueueSizes) == 0 { + t.Errorf("Expected queue sizes to be tracked in snapshots") + } + + // Verify monitoring nodes processed requests + if lastSnapshot.NodeHealth["monitor-1"] != true { + t.Errorf("Expected monitor-1 to be healthy in final snapshot") + } + + if lastSnapshot.NodeHealth["monitor-2"] != true { + t.Errorf("Expected monitor-2 to be healthy in final snapshot") + } +} + +func TestThirdPartyServiceIntegration(t *testing.T) { + // Load the third party service design + designData, err := os.ReadFile("testdata/thirdpartyservice_design.json") + if err != nil { + t.Fatalf("Failed to read third party service design: %v", err) + } + + var d design.Design + if err := json.Unmarshal(designData, &d); err != nil { + t.Fatalf("Failed to unmarshal design: %v", err) + } + + // Create engine + engine := NewEngineFromDesign(d, 100) + if engine == nil { + t.Fatalf("Failed to create engine from third 
party service design") + } + + // Set up simulation parameters + engine.RPS = 10 // Lower RPS to reduce chance of random failures affecting health + engine.EntryNode = "webserver-1" + + // Run simulation for 5 ticks (shorter run to reduce random failure impact) + snapshots := engine.Run(5, 100) + + if len(snapshots) != 5 { + t.Errorf("Expected 5 snapshots, got %d", len(snapshots)) + } + + // Verify third party service nodes exist and are configured correctly + stripeService, exists := engine.Nodes["stripe-service"] + if !exists { + t.Errorf("Stripe service node should exist") + } + + if stripeService.Type != "third party service" { + t.Errorf("Expected third party service type, got %s", stripeService.Type) + } + + twilioService, exists := engine.Nodes["twilio-service"] + if !exists { + t.Errorf("Twilio service node should exist") + } + + sendgridService, exists := engine.Nodes["sendgrid-service"] + if !exists { + t.Errorf("SendGrid service node should exist") + } + + slackService, exists := engine.Nodes["slack-service"] + if !exists { + t.Errorf("Slack service node should exist") + } + + // Note: We don't check if services are alive here because the random failure + // simulation can cause services to go down, which is realistic behavior + + // Verify provider configurations are preserved + stripeProvider := stripeService.Props["provider"] + if stripeProvider != "Stripe" { + t.Errorf("Expected Stripe provider, got %v", stripeProvider) + } + + twilioProvider := twilioService.Props["provider"] + if twilioProvider != "Twilio" { + t.Errorf("Expected Twilio provider, got %v", twilioProvider) + } + + sendgridProvider := sendgridService.Props["provider"] + if sendgridProvider != "SendGrid" { + t.Errorf("Expected SendGrid provider, got %v", sendgridProvider) + } + + slackProvider := slackService.Props["provider"] + if slackProvider != "Slack" { + t.Errorf("Expected Slack provider, got %v", slackProvider) + } + + // Verify latency configurations + stripeLatency := stripeService.Props["latency"] + if stripeLatency != 180.0 { + t.Errorf("Expected Stripe latency 180, got %v", stripeLatency) + } + + twilioLatency := twilioService.Props["latency"] + if twilioLatency != 250.0 { + t.Errorf("Expected Twilio latency 250, got %v", twilioLatency) + } + + // Check that service status was initialized and tracked + stripeStatus, ok := stripeService.Props["_serviceStatus"] + if !ok { + t.Errorf("Expected Stripe service status to be tracked") + } + + if stripeStatus == nil { + t.Errorf("Expected Stripe service status to be non-nil") + } + + // Verify other components in the chain + webserver, exists := engine.Nodes["webserver-1"] + if !exists { + t.Errorf("Web server should exist") + } + + if !webserver.Alive { + t.Errorf("Web server should be alive") + } + + // Verify microservices + paymentService, exists := engine.Nodes["microservice-1"] + if !exists { + t.Errorf("Payment service should exist") + } + + if !paymentService.Alive { + t.Errorf("Payment service should be alive") + } + + notificationService, exists := engine.Nodes["microservice-2"] + if !exists { + t.Errorf("Notification service should exist") + } + + if !notificationService.Alive { + t.Errorf("Notification service should be alive") + } + + // Verify monitoring and database + monitor, exists := engine.Nodes["monitor-1"] + if !exists { + t.Errorf("Monitor should exist") + } + + if !monitor.Alive { + t.Errorf("Monitor should be alive") + } + + database, exists := engine.Nodes["db-1"] + if !exists { + t.Errorf("Database should exist") + } + + if 
!database.Alive { + t.Errorf("Database should be alive") + } + + // Check that requests flowed through the third party services + lastSnapshot := snapshots[len(snapshots)-1] + if len(lastSnapshot.QueueSizes) == 0 { + t.Errorf("Expected queue sizes to be tracked in snapshots") + } + + // Verify third party services are being tracked in snapshots + // Note: We don't assert health status because random failures are realistic + _, stripeHealthTracked := lastSnapshot.NodeHealth["stripe-service"] + if !stripeHealthTracked { + t.Errorf("Expected Stripe service health to be tracked in snapshots") + } + + _, twilioHealthTracked := lastSnapshot.NodeHealth["twilio-service"] + if !twilioHealthTracked { + t.Errorf("Expected Twilio service health to be tracked in snapshots") + } + + _, sendgridHealthTracked := lastSnapshot.NodeHealth["sendgrid-service"] + if !sendgridHealthTracked { + t.Errorf("Expected SendGrid service health to be tracked in snapshots") + } + + _, slackHealthTracked := lastSnapshot.NodeHealth["slack-service"] + if !slackHealthTracked { + t.Errorf("Expected Slack service health to be tracked in snapshots") + } +} + +func TestDataPipelineIntegration(t *testing.T) { + // Load the data pipeline design + designData, err := os.ReadFile("testdata/datapipeline_design.json") + if err != nil { + t.Fatalf("Failed to read data pipeline design: %v", err) + } + + var d design.Design + if err := json.Unmarshal(designData, &d); err != nil { + t.Fatalf("Failed to unmarshal design: %v", err) + } + + // Create engine + engine := NewEngineFromDesign(d, 100) + if engine == nil { + t.Fatalf("Failed to create engine from data pipeline design") + } + + // Set up simulation parameters + engine.RPS = 20 + engine.EntryNode = "data-source" + + // Run simulation for 10 ticks to test data pipeline processing + snapshots := engine.Run(10, 100) + + if len(snapshots) != 10 { + t.Errorf("Expected 10 snapshots, got %d", len(snapshots)) + } + + // Verify data pipeline nodes exist and are configured correctly + etlPipeline1, exists := engine.Nodes["etl-pipeline-1"] + if !exists { + t.Errorf("ETL Pipeline 1 node should exist") + } + + if etlPipeline1.Type != "data pipeline" { + t.Errorf("Expected data pipeline type, got %s", etlPipeline1.Type) + } + + etlPipeline2, exists := engine.Nodes["etl-pipeline-2"] + if !exists { + t.Errorf("ETL Pipeline 2 node should exist") + } + + mlPipeline, exists := engine.Nodes["ml-pipeline"] + if !exists { + t.Errorf("ML Pipeline node should exist") + } + + analyticsPipeline, exists := engine.Nodes["analytics-pipeline"] + if !exists { + t.Errorf("Analytics Pipeline node should exist") + } + + compressionPipeline, exists := engine.Nodes["compression-pipeline"] + if !exists { + t.Errorf("Compression Pipeline node should exist") + } + + // Verify pipeline configurations are preserved + etl1BatchSize := etlPipeline1.Props["batchSize"] + if etl1BatchSize != 100.0 { + t.Errorf("Expected ETL Pipeline 1 batch size 100, got %v", etl1BatchSize) + } + + etl1Transformation := etlPipeline1.Props["transformation"] + if etl1Transformation != "validate" { + t.Errorf("Expected validate transformation, got %v", etl1Transformation) + } + + etl2BatchSize := etlPipeline2.Props["batchSize"] + if etl2BatchSize != 50.0 { + t.Errorf("Expected ETL Pipeline 2 batch size 50, got %v", etl2BatchSize) + } + + etl2Transformation := etlPipeline2.Props["transformation"] + if etl2Transformation != "aggregate" { + t.Errorf("Expected aggregate transformation, got %v", etl2Transformation) + } + + mlTransformation := 
mlPipeline.Props["transformation"] + if mlTransformation != "enrich" { + t.Errorf("Expected enrich transformation for ML pipeline, got %v", mlTransformation) + } + + analyticsTransformation := analyticsPipeline.Props["transformation"] + if analyticsTransformation != "join" { + t.Errorf("Expected join transformation for analytics pipeline, got %v", analyticsTransformation) + } + + compressionTransformation := compressionPipeline.Props["transformation"] + if compressionTransformation != "compress" { + t.Errorf("Expected compress transformation, got %v", compressionTransformation) + } + + // Check that pipeline state was initialized and tracked + etl1State, ok := etlPipeline1.Props["_pipelineState"] + if !ok { + t.Errorf("Expected ETL Pipeline 1 to have pipeline state") + } + + if etl1State == nil { + t.Errorf("Expected ETL Pipeline 1 state to be non-nil") + } + + // Verify other components in the data flow + dataSource, exists := engine.Nodes["data-source"] + if !exists { + t.Errorf("Data source should exist") + } + + if !dataSource.Alive { + t.Errorf("Data source should be alive") + } + + rawDataQueue, exists := engine.Nodes["raw-data-queue"] + if !exists { + t.Errorf("Raw data queue should exist") + } + + if !rawDataQueue.Alive { + t.Errorf("Raw data queue should be alive") + } + + // Verify storage components + cache, exists := engine.Nodes["cache-1"] + if !exists { + t.Errorf("Feature cache should exist") + } + + if !cache.Alive { + t.Errorf("Feature cache should be alive") + } + + dataWarehouse, exists := engine.Nodes["data-warehouse"] + if !exists { + t.Errorf("Data warehouse should exist") + } + + if !dataWarehouse.Alive { + t.Errorf("Data warehouse should be alive") + } + + // Verify monitoring + monitor, exists := engine.Nodes["monitoring-1"] + if !exists { + t.Errorf("Pipeline monitor should exist") + } + + if !monitor.Alive { + t.Errorf("Pipeline monitor should be alive") + } + + // Check that data pipelines are being tracked in snapshots + lastSnapshot := snapshots[len(snapshots)-1] + if len(lastSnapshot.QueueSizes) == 0 { + t.Errorf("Expected queue sizes to be tracked in snapshots") + } + + // Verify data pipeline health is tracked + _, etl1HealthTracked := lastSnapshot.NodeHealth["etl-pipeline-1"] + if !etl1HealthTracked { + t.Errorf("Expected ETL Pipeline 1 health to be tracked in snapshots") + } + + _, etl2HealthTracked := lastSnapshot.NodeHealth["etl-pipeline-2"] + if !etl2HealthTracked { + t.Errorf("Expected ETL Pipeline 2 health to be tracked in snapshots") + } + + _, mlHealthTracked := lastSnapshot.NodeHealth["ml-pipeline"] + if !mlHealthTracked { + t.Errorf("Expected ML Pipeline health to be tracked in snapshots") + } + + _, analyticsHealthTracked := lastSnapshot.NodeHealth["analytics-pipeline"] + if !analyticsHealthTracked { + t.Errorf("Expected Analytics Pipeline health to be tracked in snapshots") + } + + _, compressionHealthTracked := lastSnapshot.NodeHealth["compression-pipeline"] + if !compressionHealthTracked { + t.Errorf("Expected Compression Pipeline health to be tracked in snapshots") + } + + // Verify the data flow chain exists (all components are connected) + // This ensures the integration test validates the complete data processing architecture + totalNodes := len(engine.Nodes) + expectedNodes := 10 // From the design JSON + if totalNodes != expectedNodes { + t.Errorf("Expected %d total nodes in data pipeline architecture, got %d", expectedNodes, totalNodes) + } +} diff --git a/internal/simulation/messagequeue.go b/internal/simulation/messagequeue.go new 
file mode 100644 index 0000000..649ebc5 --- /dev/null +++ b/internal/simulation/messagequeue.go @@ -0,0 +1,115 @@ +package simulation + +type MessageQueueLogic struct{} + +type QueuedMessage struct { + RequestID string + Timestamp int + MessageData string + RetryCount int +} + +func (mq MessageQueueLogic) Tick(props map[string]any, queue []*Request, tick int) ([]*Request, bool) { + // Extract message queue properties + queueCapacity := int(AsFloat64(props["queueCapacity"])) + if queueCapacity == 0 { + queueCapacity = 1000 // default capacity + } + + retentionSeconds := int(AsFloat64(props["retentionSeconds"])) + if retentionSeconds == 0 { + retentionSeconds = 86400 // default 24 hours in seconds + } + + // Processing rate (messages per tick) + processingRate := int(AsFloat64(props["processingRate"])) + if processingRate == 0 { + processingRate = 100 // default 100 messages per tick + } + + // Current timestamp for this tick + currentTime := tick * 100 // assuming 100ms per tick + + // Initialize queue storage in props + messageQueue, ok := props["_messageQueue"].([]QueuedMessage) + if !ok { + messageQueue = []QueuedMessage{} + } + + // Clean up expired messages based on retention policy + messageQueue = mq.cleanExpiredMessages(messageQueue, currentTime, retentionSeconds*1000) + + // First, process existing messages from the queue (FIFO order) + output := []*Request{} + messagesToProcess := len(messageQueue) + if messagesToProcess > processingRate { + messagesToProcess = processingRate + } + + for i := 0; i < messagesToProcess; i++ { + if len(messageQueue) == 0 { + break + } + + // Dequeue message (FIFO - take from front) + message := messageQueue[0] + messageQueue = messageQueue[1:] + + // Create request for downstream processing + processedReq := &Request{ + ID: message.RequestID, + Timestamp: message.Timestamp, + LatencyMS: 2, // Small latency for queue processing + Origin: "message-queue", + Type: "PROCESS", + Path: []string{"queued-message"}, + } + + output = append(output, processedReq) + } + + // Then, add incoming requests to the queue for next tick + for _, req := range queue { + // Check if queue is at capacity + if len(messageQueue) >= queueCapacity { + // Queue full - message is dropped (or could implement backpressure) + // For now, we'll drop the message and add latency penalty + reqCopy := *req + reqCopy.LatencyMS += 1000 // High latency penalty for dropped messages + reqCopy.Path = append(reqCopy.Path, "queue-full-dropped") + // Don't add to output as message was dropped + continue + } + + // Add message to queue + message := QueuedMessage{ + RequestID: req.ID, + Timestamp: currentTime, + MessageData: "message-payload", // In real system, this would be the actual message + RetryCount: 0, + } + messageQueue = append(messageQueue, message) + } + + // Update queue storage in props + props["_messageQueue"] = messageQueue + + // Queue is healthy if not at capacity or if we can still process messages + // Queue becomes unhealthy only when completely full AND we can't process anything + healthy := len(messageQueue) < queueCapacity || processingRate > 0 + + return output, healthy +} + +func (mq MessageQueueLogic) cleanExpiredMessages(messageQueue []QueuedMessage, currentTime, retentionMs int) []QueuedMessage { + cleaned := []QueuedMessage{} + + for _, message := range messageQueue { + if (currentTime - message.Timestamp) <= retentionMs { + cleaned = append(cleaned, message) + } + // Expired messages are dropped + } + + return cleaned +} diff --git 
a/internal/simulation/messagequeue_test.go b/internal/simulation/messagequeue_test.go new file mode 100644 index 0000000..193f265 --- /dev/null +++ b/internal/simulation/messagequeue_test.go @@ -0,0 +1,329 @@ +package simulation + +import ( + "testing" +) + +func TestMessageQueueLogic_BasicProcessing(t *testing.T) { + mq := MessageQueueLogic{} + + props := map[string]any{ + "queueCapacity": 10, + "retentionSeconds": 3600, // 1 hour + "processingRate": 5, + } + + // Add some messages to the queue + reqs := []*Request{ + {ID: "msg1", Type: "SEND", LatencyMS: 0, Timestamp: 100}, + {ID: "msg2", Type: "SEND", LatencyMS: 0, Timestamp: 100}, + {ID: "msg3", Type: "SEND", LatencyMS: 0, Timestamp: 100}, + } + + output, healthy := mq.Tick(props, reqs, 1) + + if !healthy { + t.Errorf("Message queue should be healthy") + } + + // No immediate output since messages are queued first + if len(output) != 0 { + t.Errorf("Expected 0 immediate output (messages queued), got %d", len(output)) + } + + // Check that messages are in the queue + messageQueue, ok := props["_messageQueue"].([]QueuedMessage) + if !ok { + t.Errorf("Expected message queue to be initialized") + } + + if len(messageQueue) != 3 { + t.Errorf("Expected 3 messages in queue, got %d", len(messageQueue)) + } + + // Process the queue (no new incoming messages) + output2, _ := mq.Tick(props, []*Request{}, 2) + + // Should process up to processingRate (5) messages + if len(output2) != 3 { + t.Errorf("Expected 3 processed messages, got %d", len(output2)) + } + + // Queue should now be empty + messageQueue2, _ := props["_messageQueue"].([]QueuedMessage) + if len(messageQueue2) != 0 { + t.Errorf("Expected empty queue after processing, got %d messages", len(messageQueue2)) + } + + // Check output message properties + for _, msg := range output2 { + if msg.LatencyMS != 2 { + t.Errorf("Expected 2ms processing latency, got %dms", msg.LatencyMS) + } + if msg.Type != "PROCESS" { + t.Errorf("Expected PROCESS type, got %s", msg.Type) + } + } +} + +func TestMessageQueueLogic_CapacityLimit(t *testing.T) { + mq := MessageQueueLogic{} + + props := map[string]any{ + "queueCapacity": 2, // Small capacity + "retentionSeconds": 3600, + "processingRate": 1, + } + + // Add more messages than capacity + reqs := []*Request{ + {ID: "msg1", Type: "SEND", LatencyMS: 0}, + {ID: "msg2", Type: "SEND", LatencyMS: 0}, + {ID: "msg3", Type: "SEND", LatencyMS: 0}, // This should be dropped + } + + output, healthy := mq.Tick(props, reqs, 1) + + // Queue should be healthy (can still process messages) + if !healthy { + t.Errorf("Queue should be healthy (can still process)") + } + + // Should have no immediate output (messages queued) + if len(output) != 0 { + t.Errorf("Expected 0 immediate output, got %d", len(output)) + } + + // Check queue size + messageQueue, _ := props["_messageQueue"].([]QueuedMessage) + if len(messageQueue) != 2 { + t.Errorf("Expected 2 messages in queue (capacity limit), got %d", len(messageQueue)) + } + + // Add another message when queue is full + reqs2 := []*Request{{ID: "msg4", Type: "SEND", LatencyMS: 0}} + output2, healthy2 := mq.Tick(props, reqs2, 2) + + // Queue should still be healthy (can process messages) + if !healthy2 { + t.Errorf("Queue should remain healthy (can still process)") + } + + // Should have 1 processed message (processingRate = 1) + if len(output2) != 1 { + t.Errorf("Expected 1 processed message, got %d", len(output2)) + } + + // Queue should have 2 messages (started with 2, processed 1 leaving 1, added 1 new since space available) + 
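// Illustrative note (not asserted here): the overflow message msg3 took the queue-full-dropped path inside Tick on the first tick and was discarded, so dropped messages never appear in any output; only the queue length reflects the capacity limit. + 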
messageQueue2, _ := props["_messageQueue"].([]QueuedMessage) + if len(messageQueue2) != 2 { + t.Errorf("Expected 2 messages in queue (1 remaining + 1 new), got %d", len(messageQueue2)) + } +} + +func TestMessageQueueLogic_ProcessingRate(t *testing.T) { + mq := MessageQueueLogic{} + + props := map[string]any{ + "queueCapacity": 100, + "retentionSeconds": 3600, + "processingRate": 3, // Process 3 messages per tick + } + + // Add 10 messages + reqs := []*Request{} + for i := 0; i < 10; i++ { + reqs = append(reqs, &Request{ID: "msg" + string(rune(i+'0')), Type: "SEND"}) + } + + // First tick: queue all messages + mq.Tick(props, reqs, 1) + + // Second tick: process at rate limit + output, _ := mq.Tick(props, []*Request{}, 2) + + if len(output) != 3 { + t.Errorf("Expected 3 processed messages (rate limit), got %d", len(output)) + } + + // Check remaining queue size + messageQueue, _ := props["_messageQueue"].([]QueuedMessage) + if len(messageQueue) != 7 { + t.Errorf("Expected 7 messages remaining in queue, got %d", len(messageQueue)) + } + + // Third tick: process 3 more + output2, _ := mq.Tick(props, []*Request{}, 3) + + if len(output2) != 3 { + t.Errorf("Expected 3 more processed messages, got %d", len(output2)) + } + + // Check remaining queue size + messageQueue2, _ := props["_messageQueue"].([]QueuedMessage) + if len(messageQueue2) != 4 { + t.Errorf("Expected 4 messages remaining in queue, got %d", len(messageQueue2)) + } +} + +func TestMessageQueueLogic_MessageRetention(t *testing.T) { + mq := MessageQueueLogic{} + + props := map[string]any{ + "queueCapacity": 100, + "retentionSeconds": 1, // 1 second retention + "processingRate": 0, // 0 falls back to the default rate, so expiry (not the rate) must empty the queue + } + + // Add messages at tick 1 + reqs := []*Request{ + {ID: "msg1", Type: "SEND", Timestamp: 100}, + {ID: "msg2", Type: "SEND", Timestamp: 100}, + } + + mq.Tick(props, reqs, 1) + + // Check messages are queued + messageQueue, _ := props["_messageQueue"].([]QueuedMessage) + if len(messageQueue) != 2 { + t.Errorf("Expected 2 messages in queue, got %d", len(messageQueue)) + } + + // Tick at time that should expire messages (tick 20 = 2000ms, retention = 1000ms) + output, _ := mq.Tick(props, []*Request{}, 20) + + // Messages should be expired and removed + messageQueue2, _ := props["_messageQueue"].([]QueuedMessage) + if len(messageQueue2) != 0 { + t.Errorf("Expected messages to be expired and removed, got %d", len(messageQueue2)) + } + + // No output: the messages expired and were removed before they could be processed + if len(output) != 0 { + t.Errorf("Expected no output after expiry, got %d", len(output)) + } +} + +func TestMessageQueueLogic_FIFOOrdering(t *testing.T) { + mq := MessageQueueLogic{} + + props := map[string]any{ + "queueCapacity": 10, + "retentionSeconds": 3600, + "processingRate": 2, + } + + // Add messages in order + reqs := []*Request{ + {ID: "first", Type: "SEND"}, + {ID: "second", Type: "SEND"}, + {ID: "third", Type: "SEND"}, + } + + mq.Tick(props, reqs, 1) + + // Process 2 messages + output, _ := mq.Tick(props, []*Request{}, 2) + + if len(output) != 2 { + t.Errorf("Expected 2 processed messages, got %d", len(output)) + } + + // Check FIFO order + if output[0].ID != "first" { + t.Errorf("Expected first message to be 'first', got '%s'", output[0].ID) + } + + if output[1].ID != "second" { + t.Errorf("Expected second message to be 'second', got '%s'", output[1].ID) + } + + // Process remaining message + output2, _ := mq.Tick(props, []*Request{}, 3) + + if len(output2) != 1 { + t.Errorf("Expected 1 remaining message, got %d", 
len(output2)) + } + + if output2[0].ID != "third" { + t.Errorf("Expected remaining message to be 'third', got '%s'", output2[0].ID) + } +} + +func TestMessageQueueLogic_DefaultValues(t *testing.T) { + mq := MessageQueueLogic{} + + // Empty props should use defaults + props := map[string]any{} + + reqs := []*Request{{ID: "msg1", Type: "SEND"}} + output, healthy := mq.Tick(props, reqs, 1) + + if !healthy { + t.Errorf("Queue should be healthy with default values") + } + + // Should queue the message (no immediate output) + if len(output) != 0 { + t.Errorf("Expected message to be queued (0 output), got %d", len(output)) + } + + // Check that message was queued with defaults + messageQueue, _ := props["_messageQueue"].([]QueuedMessage) + if len(messageQueue) != 1 { + t.Errorf("Expected 1 message queued with defaults, got %d", len(messageQueue)) + } + + // Process with defaults (should process up to default rate) + output2, _ := mq.Tick(props, []*Request{}, 2) + + if len(output2) != 1 { + t.Errorf("Expected 1 processed message with defaults, got %d", len(output2)) + } +} + +func TestMessageQueueLogic_ContinuousFlow(t *testing.T) { + mq := MessageQueueLogic{} + + props := map[string]any{ + "queueCapacity": 5, + "retentionSeconds": 3600, + "processingRate": 2, + } + + // Tick 1: Add 3 messages + reqs1 := []*Request{ + {ID: "msg1", Type: "SEND"}, + {ID: "msg2", Type: "SEND"}, + {ID: "msg3", Type: "SEND"}, + } + output1, _ := mq.Tick(props, reqs1, 1) + + // Should queue all 3 messages + if len(output1) != 0 { + t.Errorf("Expected 0 output on first tick, got %d", len(output1)) + } + + // Tick 2: Add 2 more messages, process 2 + reqs2 := []*Request{ + {ID: "msg4", Type: "SEND"}, + {ID: "msg5", Type: "SEND"}, + } + output2, _ := mq.Tick(props, reqs2, 2) + + // Should process 2 messages + if len(output2) != 2 { + t.Errorf("Expected 2 processed messages, got %d", len(output2)) + } + + // Should have 3 messages in queue (3 remaining + 2 new - 2 processed) + messageQueue, _ := props["_messageQueue"].([]QueuedMessage) + if len(messageQueue) != 3 { + t.Errorf("Expected 3 messages in queue, got %d", len(messageQueue)) + } + + // Check processing order + if output2[0].ID != "msg1" || output2[1].ID != "msg2" { + t.Errorf("Expected FIFO processing order, got %s, %s", output2[0].ID, output2[1].ID) + } +} diff --git a/internal/simulation/microservice.go b/internal/simulation/microservice.go new file mode 100644 index 0000000..25e55e1 --- /dev/null +++ b/internal/simulation/microservice.go @@ -0,0 +1,162 @@ +package simulation + +import "math" + +type MicroserviceLogic struct{} + +type ServiceInstance struct { + ID int + CurrentLoad int + HealthStatus string +} + +func (m MicroserviceLogic) Tick(props map[string]any, queue []*Request, tick int) ([]*Request, bool) { + // Extract microservice properties + instanceCount := int(AsFloat64(props["instanceCount"])) + if instanceCount == 0 { + instanceCount = 1 // default to 1 instance + } + + cpu := int(AsFloat64(props["cpu"])) + if cpu == 0 { + cpu = 2 // default 2 CPU cores + } + + ramGb := int(AsFloat64(props["ramGb"])) + if ramGb == 0 { + ramGb = 4 // default 4GB RAM + } + + rpsCapacity := int(AsFloat64(props["rpsCapacity"])) + if rpsCapacity == 0 { + rpsCapacity = 100 // default capacity per instance + } + + scalingStrategy := AsString(props["scalingStrategy"]) + if scalingStrategy == "" { + scalingStrategy = "auto" + } + + // Calculate base latency based on resource specs + baseLatencyMs := m.calculateBaseLatency(cpu, ramGb) + + // Auto-scaling logic: adjust 
instance count based on load + currentLoad := len(queue) + if scalingStrategy == "auto" { + instanceCount = m.autoScale(instanceCount, currentLoad, rpsCapacity) + props["instanceCount"] = float64(instanceCount) // update for next tick + } + + // Total capacity across all instances + totalCapacity := instanceCount * rpsCapacity + + // Process requests up to total capacity + toProcess := queue + if len(queue) > totalCapacity { + toProcess = queue[:totalCapacity] + } + + output := []*Request{} + + // Distribute requests across instances using round-robin + for i, req := range toProcess { + + // Create processed request copy + reqCopy := *req + + // Add microservice processing latency + processingLatency := baseLatencyMs + + // Simulate CPU-bound vs I/O-bound operations + if req.Type == "GET" { + processingLatency = baseLatencyMs // Fast reads + } else if req.Type == "POST" || req.Type == "PUT" { + processingLatency = baseLatencyMs + 10 // Writes take longer + } else if req.Type == "COMPUTE" { + processingLatency = baseLatencyMs + 50 // CPU-intensive operations + } + + // Instance load affects latency (queuing delay) + instanceLoad := m.calculateInstanceLoad(i, len(toProcess), instanceCount) + if float64(instanceLoad) > float64(rpsCapacity)*0.8 { // Above 80% capacity + processingLatency += int(float64(processingLatency) * 0.5) // 50% penalty + } + + reqCopy.LatencyMS += processingLatency + reqCopy.Path = append(reqCopy.Path, "microservice-processed") + + output = append(output, &reqCopy) + } + + // Health check: service is healthy if not severely overloaded + healthy := len(queue) <= totalCapacity*2 // Allow some buffering + + return output, healthy +} + +// calculateBaseLatency determines base processing time based on resources +func (m MicroserviceLogic) calculateBaseLatency(cpu, ramGb int) int { + // Better CPU and RAM = lower base latency + // Formula: base latency inversely proportional to resources + cpuFactor := float64(cpu) + ramFactor := float64(ramGb) / 4.0 // Normalize to 4GB baseline + + resourceScore := cpuFactor * ramFactor + if resourceScore < 1 { + resourceScore = 1 + } + + baseLatency := int(50.0 / resourceScore) // 50ms baseline for 2CPU/4GB + if baseLatency < 5 { + baseLatency = 5 // Minimum 5ms processing time + } + + return baseLatency +} + +// autoScale implements simple auto-scaling logic +func (m MicroserviceLogic) autoScale(currentInstances, currentLoad, rpsPerInstance int) int { + // Calculate desired instances based on current load + desiredInstances := int(math.Ceil(float64(currentLoad) / float64(rpsPerInstance))) + + // Scale up/down gradually (max 25% change per tick) + maxChange := int(math.Max(1, float64(currentInstances)*0.25)) + + if desiredInstances > currentInstances { + // Scale up + newInstances := currentInstances + maxChange + if newInstances > desiredInstances { + newInstances = desiredInstances + } + // Cap at reasonable maximum + if newInstances > 20 { + newInstances = 20 + } + return newInstances + } else if desiredInstances < currentInstances { + // Scale down (more conservative) + newInstances := currentInstances - int(math.Max(1, float64(maxChange)*0.5)) + if newInstances < desiredInstances { + newInstances = desiredInstances + } + // Always maintain at least 1 instance + if newInstances < 1 { + newInstances = 1 + } + return newInstances + } + + return currentInstances +} + +// calculateInstanceLoad estimates load on a specific instance +func (m MicroserviceLogic) calculateInstanceLoad(instanceID, totalRequests, instanceCount int) int { + // 
Simple round-robin distribution + baseLoad := totalRequests / instanceCount + remainder := totalRequests % instanceCount + + if instanceID < remainder { + return baseLoad + 1 + } + return baseLoad +} diff --git a/internal/simulation/microservice_test.go b/internal/simulation/microservice_test.go new file mode 100644 index 0000000..4373433 --- /dev/null +++ b/internal/simulation/microservice_test.go @@ -0,0 +1,286 @@ +package simulation + +import ( + "testing" +) + +func TestMicroserviceLogic_BasicProcessing(t *testing.T) { + logic := MicroserviceLogic{} + + props := map[string]any{ + "instanceCount": 2.0, + "cpu": 4.0, + "ramGb": 8.0, + "rpsCapacity": 100.0, + "scalingStrategy": "manual", + } + + requests := []*Request{ + {ID: "1", Type: "GET", LatencyMS: 0, Path: []string{}}, + {ID: "2", Type: "POST", LatencyMS: 0, Path: []string{}}, + } + + output, healthy := logic.Tick(props, requests, 1) + + if !healthy { + t.Error("Expected microservice to be healthy") + } + + if len(output) != 2 { + t.Errorf("Expected 2 processed requests, got %d", len(output)) + } + + // Verify latency was added + for _, req := range output { + if req.LatencyMS == 0 { + t.Error("Expected latency to be added to processed request") + } + if len(req.Path) == 0 || req.Path[len(req.Path)-1] != "microservice-processed" { + t.Error("Expected path to be updated with microservice-processed") + } + } +} + +func TestMicroserviceLogic_CapacityLimit(t *testing.T) { + logic := MicroserviceLogic{} + + props := map[string]any{ + "instanceCount": 1.0, + "rpsCapacity": 2.0, + "scalingStrategy": "manual", + } + + // Send 4 requests, capacity is 2 (1 instance * 2 RPS) + // This should be healthy since 4 <= totalCapacity*2 (4) + requests := make([]*Request, 4) + for i := range requests { + requests[i] = &Request{ID: string(rune('1' + i)), Type: "GET", LatencyMS: 0} + } + + output, healthy := logic.Tick(props, requests, 1) + + if !healthy { + t.Error("Expected microservice to be healthy with moderate queuing") + } + + // Should only process 2 requests (capacity limit) + if len(output) != 2 { + t.Errorf("Expected 2 processed requests due to capacity limit, got %d", len(output)) + } +} + +func TestMicroserviceLogic_AutoScaling(t *testing.T) { + logic := MicroserviceLogic{} + + props := map[string]any{ + "instanceCount": 1.0, + "rpsCapacity": 10.0, + "scalingStrategy": "auto", + } + + // Send 25 requests to trigger scaling + requests := make([]*Request, 25) + for i := range requests { + requests[i] = &Request{ID: string(rune('1' + i)), Type: "GET", LatencyMS: 0} + } + + output, healthy := logic.Tick(props, requests, 1) + + // Check if instances were scaled up + newInstanceCount := int(props["instanceCount"].(float64)) + if newInstanceCount <= 1 { + t.Error("Expected auto-scaling to increase instance count") + } + + // Should process more than 10 requests (original capacity) + if len(output) <= 10 { + t.Errorf("Expected auto-scaling to increase processing capacity, got %d", len(output)) + } + + if !healthy { + t.Error("Expected microservice to be healthy after scaling") + } +} + +func TestMicroserviceLogic_ResourceBasedLatency(t *testing.T) { + logic := MicroserviceLogic{} + + // High-resource microservice + highResourceProps := map[string]any{ + "instanceCount": 1.0, + "cpu": 8.0, + "ramGb": 16.0, + "rpsCapacity": 100.0, + "scalingStrategy": "manual", + } + + // Low-resource microservice + lowResourceProps := map[string]any{ + "instanceCount": 1.0, + "cpu": 1.0, + "ramGb": 1.0, + "rpsCapacity": 100.0, + "scalingStrategy": "manual", + } + + 
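// Rough expectation (illustrative, not asserted): per calculateBaseLatency, 8 CPU / 16 GB RAM gives a resource score of 8 * (16/4) = 32, so 50/32 truncates to 1 and is clamped up to the 5ms floor, while 1 CPU / 1 GB RAM gives a score clamped to 1, leaving the full 50ms base latency. + 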
request := []*Request{{ID: "1", Type: "GET", LatencyMS: 0, Path: []string{}}} + + highOutput, _ := logic.Tick(highResourceProps, request, 1) + lowOutput, _ := logic.Tick(lowResourceProps, request, 1) + + highLatency := highOutput[0].LatencyMS + lowLatency := lowOutput[0].LatencyMS + + if lowLatency <= highLatency { + t.Errorf("Expected low-resource microservice (%dms) to have higher latency than high-resource (%dms)", + lowLatency, highLatency) + } +} + +func TestMicroserviceLogic_RequestTypeLatency(t *testing.T) { + logic := MicroserviceLogic{} + + props := map[string]any{ + "instanceCount": 1.0, + "cpu": 2.0, + "ramGb": 4.0, + "rpsCapacity": 100.0, + "scalingStrategy": "manual", + } + + getRequest := []*Request{{ID: "1", Type: "GET", LatencyMS: 0, Path: []string{}}} + postRequest := []*Request{{ID: "2", Type: "POST", LatencyMS: 0, Path: []string{}}} + computeRequest := []*Request{{ID: "3", Type: "COMPUTE", LatencyMS: 0, Path: []string{}}} + + getOutput, _ := logic.Tick(props, getRequest, 1) + postOutput, _ := logic.Tick(props, postRequest, 1) + computeOutput, _ := logic.Tick(props, computeRequest, 1) + + getLatency := getOutput[0].LatencyMS + postLatency := postOutput[0].LatencyMS + computeLatency := computeOutput[0].LatencyMS + + if getLatency >= postLatency { + t.Errorf("Expected GET (%dms) to be faster than POST (%dms)", getLatency, postLatency) + } + + if postLatency >= computeLatency { + t.Errorf("Expected POST (%dms) to be faster than COMPUTE (%dms)", postLatency, computeLatency) + } +} + +func TestMicroserviceLogic_HighLoadLatencyPenalty(t *testing.T) { + logic := MicroserviceLogic{} + + props := map[string]any{ + "instanceCount": 1.0, + "cpu": 2.0, + "ramGb": 4.0, + "rpsCapacity": 10.0, + "scalingStrategy": "manual", + } + + // Low load scenario + lowLoadRequest := []*Request{{ID: "1", Type: "GET", LatencyMS: 0, Path: []string{}}} + lowOutput, _ := logic.Tick(props, lowLoadRequest, 1) + lowLatency := lowOutput[0].LatencyMS + + // High load scenario (above 80% capacity threshold) + highLoadRequests := make([]*Request, 9) // 90% of 10 RPS capacity + for i := range highLoadRequests { + highLoadRequests[i] = &Request{ID: string(rune('1' + i)), Type: "GET", LatencyMS: 0, Path: []string{}} + } + highOutput, _ := logic.Tick(props, highLoadRequests, 1) + + // Check if first request has higher latency due to load + highLatency := highOutput[0].LatencyMS + + if highLatency <= lowLatency { + t.Errorf("Expected high load scenario (%dms) to have higher latency than low load (%dms)", + highLatency, lowLatency) + } +} + +func TestMicroserviceLogic_DefaultValues(t *testing.T) { + logic := MicroserviceLogic{} + + // Empty props should use defaults + props := map[string]any{} + + requests := []*Request{{ID: "1", Type: "GET", LatencyMS: 0, Path: []string{}}} + + output, healthy := logic.Tick(props, requests, 1) + + if !healthy { + t.Error("Expected microservice to be healthy with default values") + } + + if len(output) != 1 { + t.Errorf("Expected 1 processed request with defaults, got %d", len(output)) + } + + // Should have reasonable default latency + if output[0].LatencyMS <= 0 || output[0].LatencyMS > 100 { + t.Errorf("Expected reasonable default latency, got %dms", output[0].LatencyMS) + } +} + +func TestMicroserviceLogic_UnhealthyWhenOverloaded(t *testing.T) { + logic := MicroserviceLogic{} + + props := map[string]any{ + "instanceCount": 1.0, + "rpsCapacity": 5.0, + "scalingStrategy": "manual", // No auto-scaling + } + + // Send way more requests than capacity (5 * 2 = 10 max before 
unhealthy) + requests := make([]*Request, 15) // 3x capacity + for i := range requests { + requests[i] = &Request{ID: string(rune('1' + i)), Type: "GET", LatencyMS: 0} + } + + output, healthy := logic.Tick(props, requests, 1) + + if healthy { + t.Error("Expected microservice to be unhealthy when severely overloaded") + } + + // Should still process up to capacity + if len(output) != 5 { + t.Errorf("Expected 5 processed requests despite being overloaded, got %d", len(output)) + } +} + +func TestMicroserviceLogic_RoundRobinDistribution(t *testing.T) { + logic := MicroserviceLogic{} + + props := map[string]any{ + "instanceCount": 3.0, + "rpsCapacity": 10.0, + "scalingStrategy": "manual", + } + + // Send 6 requests to be distributed across 3 instances + requests := make([]*Request, 6) + for i := range requests { + requests[i] = &Request{ID: string(rune('1' + i)), Type: "GET", LatencyMS: 0, Path: []string{}} + } + + output, healthy := logic.Tick(props, requests, 1) + + if !healthy { + t.Error("Expected microservice to be healthy") + } + + if len(output) != 6 { + t.Errorf("Expected 6 processed requests, got %d", len(output)) + } + + // All requests should be processed (within total capacity of 30) + for _, req := range output { + if req.LatencyMS <= 0 { + t.Error("Expected all requests to have added latency") + } + } +} diff --git a/internal/simulation/monitoring.go b/internal/simulation/monitoring.go new file mode 100644 index 0000000..ee9c391 --- /dev/null +++ b/internal/simulation/monitoring.go @@ -0,0 +1,221 @@ +package simulation + +type MonitoringLogic struct{} + +type MetricData struct { + Timestamp int + LatencySum int + RequestCount int + ErrorCount int + QueueSize int +} + +type AlertEvent struct { + Timestamp int + MetricType string + Value float64 + Threshold float64 + Unit string + Severity string +} + +func (m MonitoringLogic) Tick(props map[string]any, queue []*Request, tick int) ([]*Request, bool) { + // Extract monitoring properties + tool := AsString(props["tool"]) + if tool == "" { + tool = "Prometheus" // default monitoring tool + } + + alertMetric := AsString(props["alertMetric"]) + if alertMetric == "" { + alertMetric = "latency" // default to latency monitoring + } + + thresholdValue := int(AsFloat64(props["thresholdValue"])) + if thresholdValue == 0 { + thresholdValue = 100 // default threshold + } + + thresholdUnit := AsString(props["thresholdUnit"]) + if thresholdUnit == "" { + thresholdUnit = "ms" // default unit + } + + // Get historical metrics from props + metrics, ok := props["_metrics"].([]MetricData) + if !ok { + metrics = []MetricData{} + } + + // Get alert history + alerts, ok := props["_alerts"].([]AlertEvent) + if !ok { + alerts = []AlertEvent{} + } + + currentTime := tick * 100 // Convert tick to milliseconds + + // Process all incoming requests (monitoring is pass-through) + output := []*Request{} + totalLatency := 0 + errorCount := 0 + + for _, req := range queue { + // Create a copy of the request to forward + reqCopy := *req + + // Add minimal monitoring overhead (1-2ms for metric collection) + monitoringOverhead := 1 + if tool == "Datadog" || tool == "New Relic" { + monitoringOverhead = 2 // More feature-rich tools have slightly higher overhead + } + + reqCopy.LatencyMS += monitoringOverhead + reqCopy.Path = append(reqCopy.Path, "monitored") + + // Collect metrics from the request + totalLatency += req.LatencyMS + + // Simple heuristic: requests with high latency are considered errors + if req.LatencyMS > 1000 { // 1 second threshold for errors + 
errorCount++ + } + + output = append(output, &reqCopy) + } + + // Calculate current metrics + avgLatency := 0.0 + if len(queue) > 0 { + avgLatency = float64(totalLatency) / float64(len(queue)) + } + + // Store current metrics + currentMetric := MetricData{ + Timestamp: currentTime, + LatencySum: totalLatency, + RequestCount: len(queue), + ErrorCount: errorCount, + QueueSize: len(queue), + } + + // Add to metrics history (keep last 10 data points) + metrics = append(metrics, currentMetric) + if len(metrics) > 10 { + metrics = metrics[1:] + } + + // Check alert conditions + shouldAlert := false + alertValue := 0.0 + + switch alertMetric { + case "latency": + alertValue = avgLatency + if avgLatency > float64(thresholdValue) && len(queue) > 0 { + shouldAlert = true + } + case "throughput": + alertValue = float64(len(queue)) + if len(queue) < thresholdValue { // Low throughput alert + shouldAlert = true + } + case "error_rate": + errorRate := 0.0 + if len(queue) > 0 { + errorRate = float64(errorCount) / float64(len(queue)) * 100 + } + alertValue = errorRate + if errorRate > float64(thresholdValue) { + shouldAlert = true + } + case "queue_size": + alertValue = float64(len(queue)) + if len(queue) > thresholdValue { + shouldAlert = true + } + } + + // Generate alert if threshold exceeded + if shouldAlert { + severity := "warning" + if alertValue > float64(thresholdValue)*1.5 { // 150% of threshold + severity = "critical" + } + + alert := AlertEvent{ + Timestamp: currentTime, + MetricType: alertMetric, + Value: alertValue, + Threshold: float64(thresholdValue), + Unit: thresholdUnit, + Severity: severity, + } + + // Only add alert if it's not a duplicate of the last alert + if len(alerts) == 0 || !m.isDuplicateAlert(alerts[len(alerts)-1], alert) { + alerts = append(alerts, alert) + } + + // Keep only last 20 alerts + if len(alerts) > 20 { + alerts = alerts[1:] + } + } + + // Update props with collected data + props["_metrics"] = metrics + props["_alerts"] = alerts + props["_currentLatency"] = avgLatency + props["_alertCount"] = len(alerts) + + // Monitoring system health - it's healthy unless it's completely overloaded + healthy := len(queue) < 10000 // Can handle very high loads + + // If too many critical alerts recently, mark as unhealthy + recentCriticalAlerts := 0 + for _, alert := range alerts { + if currentTime-alert.Timestamp < 10000 && alert.Severity == "critical" { // Last 10 seconds + recentCriticalAlerts++ + } + } + + if recentCriticalAlerts > 5 { + healthy = false + } + + return output, healthy +} + +// isDuplicateAlert checks if an alert is similar to the previous one to avoid spam +func (m MonitoringLogic) isDuplicateAlert(prev, current AlertEvent) bool { + return prev.MetricType == current.MetricType && + prev.Severity == current.Severity && + (current.Timestamp-prev.Timestamp) < 5000 // Within 5 seconds +} + +// Helper function to calculate moving average +func (m MonitoringLogic) calculateMovingAverage(metrics []MetricData, window int) float64 { + if len(metrics) == 0 { + return 0 + } + + start := 0 + if len(metrics) > window { + start = len(metrics) - window + } + + sum := 0.0 + count := 0 + for i := start; i < len(metrics); i++ { + if metrics[i].RequestCount > 0 { + sum += float64(metrics[i].LatencySum) / float64(metrics[i].RequestCount) + count++ + } + } + + if count == 0 { + return 0 + } + return sum / float64(count) +} diff --git a/internal/simulation/monitoring_test.go b/internal/simulation/monitoring_test.go new file mode 100644 index 0000000..cae87d0 --- /dev/null +++ 
b/internal/simulation/monitoring_test.go @@ -0,0 +1,411 @@ +package simulation + +import ( + "testing" +) + +func TestMonitoringLogic_BasicPassthrough(t *testing.T) { + logic := MonitoringLogic{} + + props := map[string]any{ + "tool": "Prometheus", + "alertMetric": "latency", + "thresholdValue": 100.0, + "thresholdUnit": "ms", + } + + requests := []*Request{ + {ID: "1", Type: "GET", LatencyMS: 50, Path: []string{}}, + {ID: "2", Type: "POST", LatencyMS: 75, Path: []string{}}, + } + + output, healthy := logic.Tick(props, requests, 1) + + if !healthy { + t.Error("Expected monitoring to be healthy") + } + + if len(output) != 2 { + t.Errorf("Expected 2 requests to pass through monitoring, got %d", len(output)) + } + + // Verify minimal latency overhead was added + for i, req := range output { + originalLatency := requests[i].LatencyMS + if req.LatencyMS <= originalLatency { + t.Errorf("Expected monitoring overhead to be added to latency") + } + if req.LatencyMS > originalLatency+5 { + t.Errorf("Expected minimal monitoring overhead, got %d ms added", req.LatencyMS-originalLatency) + } + if len(req.Path) == 0 || req.Path[len(req.Path)-1] != "monitored" { + t.Error("Expected path to be updated with 'monitored'") + } + } +} + +func TestMonitoringLogic_MetricsCollection(t *testing.T) { + logic := MonitoringLogic{} + + props := map[string]any{ + "tool": "Datadog", + "alertMetric": "latency", + "thresholdValue": 100.0, + "thresholdUnit": "ms", + } + + requests := []*Request{ + {ID: "1", Type: "GET", LatencyMS: 50}, + {ID: "2", Type: "POST", LatencyMS: 150}, + {ID: "3", Type: "GET", LatencyMS: 75}, + } + + _, healthy := logic.Tick(props, requests, 1) + + if !healthy { + t.Error("Expected monitoring to be healthy") + } + + // Check that metrics were collected + metrics, ok := props["_metrics"].([]MetricData) + if !ok { + t.Error("Expected metrics to be collected in props") + } + + if len(metrics) != 1 { + t.Errorf("Expected 1 metric data point, got %d", len(metrics)) + } + + metric := metrics[0] + if metric.RequestCount != 3 { + t.Errorf("Expected 3 requests counted, got %d", metric.RequestCount) + } + + if metric.LatencySum != 275 { // 50 + 150 + 75 + t.Errorf("Expected latency sum of 275, got %d", metric.LatencySum) + } + + // Check current latency calculation + currentLatency, ok := props["_currentLatency"].(float64) + if !ok { + t.Error("Expected current latency to be calculated") + } + + if currentLatency < 90 || currentLatency > 95 { + t.Errorf("Expected average latency around 91.67, got %f", currentLatency) + } +} + +func TestMonitoringLogic_LatencyAlert(t *testing.T) { + logic := MonitoringLogic{} + + props := map[string]any{ + "tool": "Prometheus", + "alertMetric": "latency", + "thresholdValue": 80.0, + "thresholdUnit": "ms", + } + + // Send requests that exceed latency threshold + requests := []*Request{ + {ID: "1", Type: "GET", LatencyMS: 100}, + {ID: "2", Type: "POST", LatencyMS: 120}, + } + + _, healthy := logic.Tick(props, requests, 1) + + if !healthy { + t.Error("Expected monitoring to be healthy despite alerts") + } + + // Check that alert was generated + alerts, ok := props["_alerts"].([]AlertEvent) + if !ok { + t.Error("Expected alerts to be stored in props") + } + + if len(alerts) != 1 { + t.Errorf("Expected 1 alert to be generated, got %d", len(alerts)) + } + + alert := alerts[0] + if alert.MetricType != "latency" { + t.Errorf("Expected latency alert, got %s", alert.MetricType) + } + + if alert.Threshold != 80.0 { + t.Errorf("Expected threshold of 80, got %f", alert.Threshold) + } + 
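// For reference: the two requests average (100+120)/2 = 110ms, above the 80ms threshold but below the critical cutoff of 1.5*80 = 120ms, so a single warning-level alert is expected. +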
+ if alert.Value < 80.0 { + t.Errorf("Expected alert value to exceed threshold, got %f", alert.Value) + } + + if alert.Severity != "warning" { + t.Errorf("Expected warning severity, got %s", alert.Severity) + } +} + +func TestMonitoringLogic_ErrorRateAlert(t *testing.T) { + logic := MonitoringLogic{} + + props := map[string]any{ + "tool": "Prometheus", + "alertMetric": "error_rate", + "thresholdValue": 20.0, // 20% error rate threshold + "thresholdUnit": "percent", + } + + // Send mix of normal and high-latency (error) requests + requests := []*Request{ + {ID: "1", Type: "GET", LatencyMS: 100}, // normal + {ID: "2", Type: "POST", LatencyMS: 1200}, // error (>1000ms) + {ID: "3", Type: "GET", LatencyMS: 200}, // normal + {ID: "4", Type: "POST", LatencyMS: 1500}, // error + } + + _, healthy := logic.Tick(props, requests, 1) + + if !healthy { + t.Error("Expected monitoring to be healthy") + } + + // Check that error rate alert was generated (50% error rate > 20% threshold) + alerts, ok := props["_alerts"].([]AlertEvent) + if !ok { + t.Error("Expected alerts to be stored in props") + } + + if len(alerts) != 1 { + t.Errorf("Expected 1 alert to be generated, got %d", len(alerts)) + } + + alert := alerts[0] + if alert.MetricType != "error_rate" { + t.Errorf("Expected error_rate alert, got %s", alert.MetricType) + } + + if alert.Value != 50.0 { // 2 errors out of 4 requests = 50% + t.Errorf("Expected 50%% error rate, got %f", alert.Value) + } +} + +func TestMonitoringLogic_QueueSizeAlert(t *testing.T) { + logic := MonitoringLogic{} + + props := map[string]any{ + "tool": "Prometheus", + "alertMetric": "queue_size", + "thresholdValue": 5.0, + "thresholdUnit": "requests", + } + + // Send more requests than threshold + requests := make([]*Request, 8) + for i := range requests { + requests[i] = &Request{ID: string(rune('1' + i)), Type: "GET", LatencyMS: 50} + } + + _, healthy := logic.Tick(props, requests, 1) + + if !healthy { + t.Error("Expected monitoring to be healthy with queue size alert") + } + + // Check that queue size alert was generated + alerts, ok := props["_alerts"].([]AlertEvent) + if !ok { + t.Error("Expected alerts to be stored in props") + } + + if len(alerts) != 1 { + t.Errorf("Expected 1 alert to be generated, got %d", len(alerts)) + } + + alert := alerts[0] + if alert.MetricType != "queue_size" { + t.Errorf("Expected queue_size alert, got %s", alert.MetricType) + } + + if alert.Value != 8.0 { + t.Errorf("Expected queue size of 8, got %f", alert.Value) + } +} + +func TestMonitoringLogic_CriticalAlert(t *testing.T) { + logic := MonitoringLogic{} + + props := map[string]any{ + "tool": "Prometheus", + "alertMetric": "latency", + "thresholdValue": 100.0, + "thresholdUnit": "ms", + } + + // Send requests with very high latency (150% of threshold) + requests := []*Request{ + {ID: "1", Type: "GET", LatencyMS: 180}, // 180 > 150 (1.5 * 100) + {ID: "2", Type: "POST", LatencyMS: 200}, + } + + _, healthy := logic.Tick(props, requests, 1) + + if !healthy { + t.Error("Expected monitoring to be healthy") + } + + alerts, ok := props["_alerts"].([]AlertEvent) + if !ok { + t.Error("Expected alerts to be stored in props") + } + + if len(alerts) != 1 { + t.Errorf("Expected 1 alert to be generated, got %d", len(alerts)) + } + + alert := alerts[0] + if alert.Severity != "critical" { + t.Errorf("Expected critical severity for high threshold breach, got %s", alert.Severity) + } +} + +func TestMonitoringLogic_DuplicateAlertSuppression(t *testing.T) { + logic := MonitoringLogic{} + + props := map[string]any{ 
+ "tool": "Prometheus", + "alertMetric": "latency", + "thresholdValue": 80.0, + "thresholdUnit": "ms", + } + + requests := []*Request{ + {ID: "1", Type: "GET", LatencyMS: 100}, + } + + // First tick - should generate alert + logic.Tick(props, requests, 1) + + alerts, _ := props["_alerts"].([]AlertEvent) + if len(alerts) != 1 { + t.Errorf("Expected 1 alert after first tick, got %d", len(alerts)) + } + + // Second tick immediately after - should suppress duplicate + logic.Tick(props, requests, 2) + + alerts, _ = props["_alerts"].([]AlertEvent) + if len(alerts) != 1 { + t.Errorf("Expected duplicate alert to be suppressed, got %d alerts", len(alerts)) + } +} + +func TestMonitoringLogic_DefaultValues(t *testing.T) { + logic := MonitoringLogic{} + + // Empty props should use defaults + props := map[string]any{} + + requests := []*Request{{ID: "1", Type: "GET", LatencyMS: 50, Path: []string{}}} + + output, healthy := logic.Tick(props, requests, 1) + + if !healthy { + t.Error("Expected monitoring to be healthy with default values") + } + + if len(output) != 1 { + t.Errorf("Expected 1 request to pass through, got %d", len(output)) + } + + // Should have reasonable default monitoring overhead + if output[0].LatencyMS <= 50 || output[0].LatencyMS > 55 { + t.Errorf("Expected default monitoring overhead, got %dms total", output[0].LatencyMS) + } +} + +func TestMonitoringLogic_ToolSpecificOverhead(t *testing.T) { + logic := MonitoringLogic{} + + // Test Prometheus (lower overhead) + propsPrometheus := map[string]any{ + "tool": "Prometheus", + } + + // Test Datadog (higher overhead) + propsDatadog := map[string]any{ + "tool": "Datadog", + } + + request := []*Request{{ID: "1", Type: "GET", LatencyMS: 50, Path: []string{}}} + + prometheusOutput, _ := logic.Tick(propsPrometheus, request, 1) + datadogOutput, _ := logic.Tick(propsDatadog, request, 1) + + prometheusOverhead := prometheusOutput[0].LatencyMS - 50 + datadogOverhead := datadogOutput[0].LatencyMS - 50 + + if datadogOverhead <= prometheusOverhead { + t.Errorf("Expected Datadog (%dms) to have higher overhead than Prometheus (%dms)", + datadogOverhead, prometheusOverhead) + } +} + +func TestMonitoringLogic_UnhealthyWithManyAlerts(t *testing.T) { + logic := MonitoringLogic{} + + props := map[string]any{ + "tool": "Prometheus", + "alertMetric": "latency", + "thresholdValue": 50.0, + "thresholdUnit": "ms", + } + + // Manually create many recent critical alerts to simulate an unhealthy state + currentTime := 10000 // 10 seconds + recentAlerts := []AlertEvent{ + {Timestamp: currentTime - 1000, MetricType: "latency", Severity: "critical", Value: 200}, + {Timestamp: currentTime - 2000, MetricType: "latency", Severity: "critical", Value: 180}, + {Timestamp: currentTime - 3000, MetricType: "latency", Severity: "critical", Value: 190}, + {Timestamp: currentTime - 4000, MetricType: "latency", Severity: "critical", Value: 170}, + {Timestamp: currentTime - 5000, MetricType: "latency", Severity: "critical", Value: 160}, + {Timestamp: currentTime - 6000, MetricType: "latency", Severity: "critical", Value: 150}, + } + + // Set up the props with existing critical alerts + props["_alerts"] = recentAlerts + + // Make a request that would trigger another alert (low latency to avoid triggering new alert) + requests := []*Request{{ID: "1", Type: "GET", LatencyMS: 40}} + + // This tick should recognize the existing critical alerts and mark system as unhealthy + _, healthy := logic.Tick(props, requests, 100) // tick 100 = 10000ms + + if healthy { + t.Error("Expected 
monitoring to be unhealthy due to many recent critical alerts") + } +} + +func TestMonitoringLogic_MetricsHistoryLimit(t *testing.T) { + logic := MonitoringLogic{} + + props := map[string]any{ + "tool": "Prometheus", + } + + request := []*Request{{ID: "1", Type: "GET", LatencyMS: 50}} + + // Generate more than 10 metric data points + for i := 0; i < 15; i++ { + logic.Tick(props, request, i) + } + + metrics, ok := props["_metrics"].([]MetricData) + if !ok { + t.Error("Expected metrics to be stored") + } + + if len(metrics) != 10 { + t.Errorf("Expected metrics history to be limited to 10, got %d", len(metrics)) + } +} diff --git a/internal/simulation/testdata/cache_design.json b/internal/simulation/testdata/cache_design.json new file mode 100644 index 0000000..c41a1a1 --- /dev/null +++ b/internal/simulation/testdata/cache_design.json @@ -0,0 +1,55 @@ +{ + "nodes": [ + { + "id": "webserver", + "type": "webserver", + "position": { "x": 0, "y": 0 }, + "props": { + "label": "Web Server", + "rpsCapacity": 100 + } + }, + { + "id": "cache", + "type": "cache", + "position": { "x": 100, "y": 0 }, + "props": { + "label": "Redis Cache", + "cacheTTL": 300000, + "maxEntries": 1000, + "evictionPolicy": "LRU" + } + }, + { + "id": "database", + "type": "database", + "position": { "x": 200, "y": 0 }, + "props": { + "label": "Primary DB", + "replication": 2, + "maxRPS": 500, + "baseLatencyMs": 20 + } + } + ], + "connections": [ + { + "source": "webserver", + "target": "cache", + "label": "Cache Lookup", + "direction": "forward", + "protocol": "Redis", + "tls": false, + "capacity": 1000 + }, + { + "source": "cache", + "target": "database", + "label": "Cache Miss", + "direction": "forward", + "protocol": "TCP", + "tls": true, + "capacity": 1000 + } + ] +} diff --git a/internal/simulation/testdata/database_design.json b/internal/simulation/testdata/database_design.json new file mode 100644 index 0000000..20abb71 --- /dev/null +++ b/internal/simulation/testdata/database_design.json @@ -0,0 +1,35 @@ +{ + "nodes": [ + { + "id": "webserver", + "type": "webserver", + "position": { "x": 0, "y": 0 }, + "props": { + "label": "Web Server", + "rpsCapacity": 100 + } + }, + { + "id": "database", + "type": "database", + "position": { "x": 100, "y": 0 }, + "props": { + "label": "Primary DB", + "replication": 2, + "maxRPS": 500, + "baseLatencyMs": 15 + } + } + ], + "connections": [ + { + "source": "webserver", + "target": "database", + "label": "DB Queries", + "direction": "forward", + "protocol": "TCP", + "tls": true, + "capacity": 1000 + } + ] +} diff --git a/internal/simulation/testdata/datapipeline_design.json b/internal/simulation/testdata/datapipeline_design.json new file mode 100644 index 0000000..d0d8a15 --- /dev/null +++ b/internal/simulation/testdata/datapipeline_design.json @@ -0,0 +1,188 @@ +{ + "nodes": [ + { + "id": "data-source", + "type": "webserver", + "position": { "x": 100, "y": 200 }, + "props": { + "label": "Data Ingestion API", + "rpsCapacity": 500 + } + }, + { + "id": "raw-data-queue", + "type": "messageQueue", + "position": { "x": 300, "y": 200 }, + "props": { + "label": "Raw Data Queue", + "queueCapacity": 10000, + "retentionSeconds": 3600, + "processingRate": 200 + } + }, + { + "id": "etl-pipeline-1", + "type": "data pipeline", + "position": { "x": 500, "y": 150 }, + "props": { + "label": "Data Cleansing Pipeline", + "batchSize": 100, + "transformation": "validate" + } + }, + { + "id": "etl-pipeline-2", + "type": "data pipeline", + "position": { "x": 500, "y": 250 }, + "props": { + "label": "Data 
Transformation Pipeline", + "batchSize": 50, + "transformation": "aggregate" + } + }, + { + "id": "ml-pipeline", + "type": "data pipeline", + "position": { "x": 700, "y": 150 }, + "props": { + "label": "ML Feature Pipeline", + "batchSize": 200, + "transformation": "enrich" + } + }, + { + "id": "analytics-pipeline", + "type": "data pipeline", + "position": { "x": 700, "y": 250 }, + "props": { + "label": "Analytics Pipeline", + "batchSize": 500, + "transformation": "join" + } + }, + { + "id": "cache-1", + "type": "cache", + "position": { "x": 900, "y": 150 }, + "props": { + "label": "Feature Cache", + "cacheTTL": 300, + "maxEntries": 50000, + "evictionPolicy": "LRU" + } + }, + { + "id": "data-warehouse", + "type": "database", + "position": { "x": 900, "y": 250 }, + "props": { + "label": "Data Warehouse", + "replication": 3, + "maxRPS": 1000, + "baseLatencyMs": 50 + } + }, + { + "id": "monitoring-1", + "type": "monitoring/alerting", + "position": { "x": 500, "y": 350 }, + "props": { + "label": "Pipeline Monitor", + "tool": "Datadog", + "alertMetric": "latency", + "thresholdValue": 1000, + "thresholdUnit": "ms" + } + }, + { + "id": "compression-pipeline", + "type": "data pipeline", + "position": { "x": 300, "y": 350 }, + "props": { + "label": "Data Compression", + "batchSize": 1000, + "transformation": "compress" + } + } + ], + "connections": [ + { + "source": "data-source", + "target": "raw-data-queue", + "label": "Raw Data Stream", + "protocol": "http" + }, + { + "source": "raw-data-queue", + "target": "etl-pipeline-1", + "label": "Data Validation", + "protocol": "tcp" + }, + { + "source": "raw-data-queue", + "target": "etl-pipeline-2", + "label": "Data Transformation", + "protocol": "tcp" + }, + { + "source": "etl-pipeline-1", + "target": "ml-pipeline", + "label": "Clean Data", + "protocol": "tcp" + }, + { + "source": "etl-pipeline-2", + "target": "analytics-pipeline", + "label": "Transformed Data", + "protocol": "tcp" + }, + { + "source": "ml-pipeline", + "target": "cache-1", + "label": "ML Features", + "protocol": "tcp" + }, + { + "source": "analytics-pipeline", + "target": "data-warehouse", + "label": "Analytics Data", + "protocol": "tcp" + }, + { + "source": "etl-pipeline-1", + "target": "monitoring-1", + "label": "Pipeline Metrics", + "protocol": "http" + }, + { + "source": "etl-pipeline-2", + "target": "monitoring-1", + "label": "Pipeline Metrics", + "protocol": "http" + }, + { + "source": "ml-pipeline", + "target": "monitoring-1", + "label": "Pipeline Metrics", + "protocol": "http" + }, + { + "source": "analytics-pipeline", + "target": "monitoring-1", + "label": "Pipeline Metrics", + "protocol": "http" + }, + { + "source": "raw-data-queue", + "target": "compression-pipeline", + "label": "Archive Stream", + "protocol": "tcp" + }, + { + "source": "compression-pipeline", + "target": "data-warehouse", + "label": "Compressed Archive", + "protocol": "tcp" + } + ] +} diff --git a/internal/simulation/testdata/messagequeue_design.json b/internal/simulation/testdata/messagequeue_design.json new file mode 100644 index 0000000..3b3c0a7 --- /dev/null +++ b/internal/simulation/testdata/messagequeue_design.json @@ -0,0 +1,53 @@ +{ + "nodes": [ + { + "id": "producer", + "type": "webserver", + "position": { "x": 0, "y": 0 }, + "props": { + "label": "Message Producer", + "rpsCapacity": 50 + } + }, + { + "id": "messagequeue", + "type": "messageQueue", + "position": { "x": 100, "y": 0 }, + "props": { + "label": "Event Queue", + "queueCapacity": 1000, + "retentionSeconds": 3600, + "processingRate": 100 + 
} + }, + { + "id": "consumer", + "type": "webserver", + "position": { "x": 200, "y": 0 }, + "props": { + "label": "Message Consumer", + "rpsCapacity": 80 + } + } + ], + "connections": [ + { + "source": "producer", + "target": "messagequeue", + "label": "Publish Messages", + "direction": "forward", + "protocol": "AMQP", + "tls": false, + "capacity": 1000 + }, + { + "source": "messagequeue", + "target": "consumer", + "label": "Consume Messages", + "direction": "forward", + "protocol": "AMQP", + "tls": false, + "capacity": 1000 + } + ] +} diff --git a/internal/simulation/testdata/microservice_design.json b/internal/simulation/testdata/microservice_design.json new file mode 100644 index 0000000..dc4c187 --- /dev/null +++ b/internal/simulation/testdata/microservice_design.json @@ -0,0 +1,96 @@ +{ + "nodes": [ + { + "id": "webserver-1", + "type": "webserver", + "position": { "x": 100, "y": 200 }, + "props": { + "label": "API Gateway", + "rpsCapacity": 200 + } + }, + { + "id": "lb-1", + "type": "loadbalancer", + "position": { "x": 300, "y": 200 }, + "props": { + "label": "API Gateway", + "algorithm": "round-robin" + } + }, + { + "id": "microservice-1", + "type": "microservice", + "position": { "x": 500, "y": 150 }, + "props": { + "label": "User Service", + "instanceCount": 3, + "cpu": 4, + "ramGb": 8, + "rpsCapacity": 100, + "monthlyUsd": 150, + "scalingStrategy": "auto", + "apiVersion": "v2" + } + }, + { + "id": "microservice-2", + "type": "microservice", + "position": { "x": 500, "y": 250 }, + "props": { + "label": "Order Service", + "instanceCount": 2, + "cpu": 2, + "ramGb": 4, + "rpsCapacity": 80, + "monthlyUsd": 90, + "scalingStrategy": "manual", + "apiVersion": "v1" + } + }, + { + "id": "db-1", + "type": "database", + "position": { "x": 700, "y": 200 }, + "props": { + "label": "PostgreSQL", + "replication": 2, + "maxRPS": 500, + "baseLatencyMs": 15 + } + } + ], + "connections": [ + { + "source": "webserver-1", + "target": "lb-1", + "label": "HTTPS Requests", + "protocol": "https", + "tls": true + }, + { + "source": "lb-1", + "target": "microservice-1", + "label": "User API", + "protocol": "http" + }, + { + "source": "lb-1", + "target": "microservice-2", + "label": "Order API", + "protocol": "http" + }, + { + "source": "microservice-1", + "target": "db-1", + "label": "User Queries", + "protocol": "tcp" + }, + { + "source": "microservice-2", + "target": "db-1", + "label": "Order Queries", + "protocol": "tcp" + } + ] +} diff --git a/internal/simulation/testdata/monitoring_design.json b/internal/simulation/testdata/monitoring_design.json new file mode 100644 index 0000000..7ec761a --- /dev/null +++ b/internal/simulation/testdata/monitoring_design.json @@ -0,0 +1,127 @@ +{ + "nodes": [ + { + "id": "webserver-1", + "type": "webserver", + "position": { "x": 100, "y": 200 }, + "props": { + "label": "Web Server", + "rpsCapacity": 100 + } + }, + { + "id": "monitor-1", + "type": "monitoring/alerting", + "position": { "x": 300, "y": 200 }, + "props": { + "label": "Prometheus Monitor", + "tool": "Prometheus", + "alertMetric": "latency", + "thresholdValue": 80, + "thresholdUnit": "ms" + } + }, + { + "id": "lb-1", + "type": "loadbalancer", + "position": { "x": 500, "y": 200 }, + "props": { + "label": "Load Balancer", + "algorithm": "round-robin" + } + }, + { + "id": "microservice-1", + "type": "microservice", + "position": { "x": 700, "y": 150 }, + "props": { + "label": "User Service", + "instanceCount": 2, + "cpu": 2, + "ramGb": 4, + "rpsCapacity": 50, + "scalingStrategy": "auto" + } + }, + { + "id": 
"microservice-2", + "type": "microservice", + "position": { "x": 700, "y": 250 }, + "props": { + "label": "Order Service", + "instanceCount": 1, + "cpu": 1, + "ramGb": 2, + "rpsCapacity": 30, + "scalingStrategy": "manual" + } + }, + { + "id": "monitor-2", + "type": "monitoring/alerting", + "position": { "x": 900, "y": 200 }, + "props": { + "label": "Error Rate Monitor", + "tool": "Datadog", + "alertMetric": "error_rate", + "thresholdValue": 5, + "thresholdUnit": "percent" + } + }, + { + "id": "db-1", + "type": "database", + "position": { "x": 1100, "y": 200 }, + "props": { + "label": "PostgreSQL", + "replication": 2, + "maxRPS": 200, + "baseLatencyMs": 15 + } + } + ], + "connections": [ + { + "source": "webserver-1", + "target": "monitor-1", + "label": "HTTP Requests", + "protocol": "http" + }, + { + "source": "monitor-1", + "target": "lb-1", + "label": "Monitored Requests", + "protocol": "http" + }, + { + "source": "lb-1", + "target": "microservice-1", + "label": "User API", + "protocol": "http" + }, + { + "source": "lb-1", + "target": "microservice-2", + "label": "Order API", + "protocol": "http" + }, + { + "source": "microservice-1", + "target": "monitor-2", + "label": "Service Metrics", + "protocol": "http" + }, + { + "source": "microservice-2", + "target": "monitor-2", + "label": "Service Metrics", + "protocol": "http" + }, + { + "source": "monitor-2", + "target": "db-1", + "label": "Database Queries", + "protocol": "tcp" + } + ] +} diff --git a/internal/simulation/testdata/simple_design.json b/internal/simulation/testdata/simple_design.json index 757c214..d04bf8c 100644 --- a/internal/simulation/testdata/simple_design.json +++ b/internal/simulation/testdata/simple_design.json @@ -16,7 +16,7 @@ "props": { "label": "Web Server", "instanceSize": "medium", - "capacityRPS": 5, + "rpsCapacity": 5, "baseLatencyMs": 50, "penaltyPerRPS": 10 } diff --git a/internal/simulation/testdata/thirdpartyservice_design.json b/internal/simulation/testdata/thirdpartyservice_design.json new file mode 100644 index 0000000..f98cd22 --- /dev/null +++ b/internal/simulation/testdata/thirdpartyservice_design.json @@ -0,0 +1,164 @@ +{ + "nodes": [ + { + "id": "webserver-1", + "type": "webserver", + "position": { "x": 100, "y": 200 }, + "props": { + "label": "E-commerce API", + "rpsCapacity": 200 + } + }, + { + "id": "microservice-1", + "type": "microservice", + "position": { "x": 300, "y": 200 }, + "props": { + "label": "Payment Service", + "instanceCount": 2, + "cpu": 4, + "ramGb": 8, + "rpsCapacity": 100, + "scalingStrategy": "auto" + } + }, + { + "id": "stripe-service", + "type": "third party service", + "position": { "x": 500, "y": 150 }, + "props": { + "label": "Stripe Payments", + "provider": "Stripe", + "latency": 180 + } + }, + { + "id": "twilio-service", + "type": "third party service", + "position": { "x": 500, "y": 250 }, + "props": { + "label": "SMS Notifications", + "provider": "Twilio", + "latency": 250 + } + }, + { + "id": "microservice-2", + "type": "microservice", + "position": { "x": 300, "y": 350 }, + "props": { + "label": "Notification Service", + "instanceCount": 1, + "cpu": 2, + "ramGb": 4, + "rpsCapacity": 50, + "scalingStrategy": "manual" + } + }, + { + "id": "sendgrid-service", + "type": "third party service", + "position": { "x": 500, "y": 350 }, + "props": { + "label": "Email Service", + "provider": "SendGrid", + "latency": 200 + } + }, + { + "id": "slack-service", + "type": "third party service", + "position": { "x": 500, "y": 450 }, + "props": { + "label": "Slack Alerts", + 
"provider": "Slack", + "latency": 300 + } + }, + { + "id": "monitor-1", + "type": "monitoring/alerting", + "position": { "x": 700, "y": 200 }, + "props": { + "label": "System Monitor", + "tool": "Datadog", + "alertMetric": "latency", + "thresholdValue": 500, + "thresholdUnit": "ms" + } + }, + { + "id": "db-1", + "type": "database", + "position": { "x": 700, "y": 350 }, + "props": { + "label": "Transaction DB", + "replication": 2, + "maxRPS": 300, + "baseLatencyMs": 20 + } + } + ], + "connections": [ + { + "source": "webserver-1", + "target": "microservice-1", + "label": "Payment Requests", + "protocol": "https" + }, + { + "source": "microservice-1", + "target": "stripe-service", + "label": "Process Payment", + "protocol": "https" + }, + { + "source": "microservice-1", + "target": "twilio-service", + "label": "SMS Confirmation", + "protocol": "https" + }, + { + "source": "webserver-1", + "target": "microservice-2", + "label": "Notification Requests", + "protocol": "https" + }, + { + "source": "microservice-2", + "target": "sendgrid-service", + "label": "Send Email", + "protocol": "https" + }, + { + "source": "microservice-2", + "target": "slack-service", + "label": "Admin Alerts", + "protocol": "https" + }, + { + "source": "stripe-service", + "target": "monitor-1", + "label": "Payment Metrics", + "protocol": "http" + }, + { + "source": "twilio-service", + "target": "monitor-1", + "label": "SMS Metrics", + "protocol": "http" + }, + { + "source": "sendgrid-service", + "target": "monitor-1", + "label": "Email Metrics", + "protocol": "http" + }, + { + "source": "monitor-1", + "target": "db-1", + "label": "Store Metrics", + "protocol": "tcp" + } + ] +} diff --git a/internal/simulation/thirdpartyservice.go b/internal/simulation/thirdpartyservice.go new file mode 100644 index 0000000..feee5a0 --- /dev/null +++ b/internal/simulation/thirdpartyservice.go @@ -0,0 +1,219 @@ +package simulation + +import ( + "math/rand" +) + +type ThirdPartyServiceLogic struct{} + +type ServiceStatus struct { + IsUp bool + LastCheck int + FailureCount int + SuccessCount int + RateLimitHits int +} + +func (t ThirdPartyServiceLogic) Tick(props map[string]any, queue []*Request, tick int) ([]*Request, bool) { + // Extract third-party service properties + provider := AsString(props["provider"]) + if provider == "" { + provider = "Generic" // default provider + } + + baseLatency := int(AsFloat64(props["latency"])) + if baseLatency == 0 { + baseLatency = 200 // default 200ms latency + } + + // Get service status from props (persistent state) + status, ok := props["_serviceStatus"].(ServiceStatus) + if !ok { + status = ServiceStatus{ + IsUp: true, + LastCheck: tick, + FailureCount: 0, + SuccessCount: 0, + RateLimitHits: 0, + } + } + + currentTime := tick * 100 // Convert tick to milliseconds + + // Simulate service availability and characteristics based on provider + reliability := t.getProviderReliability(provider) + rateLimitRPS := t.getProviderRateLimit(provider) + latencyVariance := t.getProviderLatencyVariance(provider) + + // Check if service is down and should recover + if !status.IsUp { + // Services typically recover after some time + if currentTime-status.LastCheck > 30000 { // 30 seconds downtime + status.IsUp = true + status.FailureCount = 0 + } + } + + // Apply rate limiting - third-party services often have strict limits + requestsThisTick := len(queue) + if requestsThisTick > rateLimitRPS { + status.RateLimitHits++ + // Only process up to rate limit + queue = queue[:rateLimitRPS] + } + + output := []*Request{} + 
+ for _, req := range queue { + reqCopy := *req + + // Simulate service availability + if !status.IsUp { + // Service is down - simulate timeout/error + reqCopy.LatencyMS += 10000 // 10 second timeout + reqCopy.Path = append(reqCopy.Path, "third-party-timeout") + status.FailureCount++ + } else { + // Service is up - calculate response time + serviceLatency := t.calculateServiceLatency(provider, baseLatency, latencyVariance) + + // Random failure based on reliability + if rand.Float64() > reliability { + // Service call failed + serviceLatency += 5000 // 5 second timeout on failure + reqCopy.Path = append(reqCopy.Path, "third-party-failed") + status.FailureCount++ + + // If too many failures, mark service as down + if status.FailureCount > 5 { + status.IsUp = false + status.LastCheck = currentTime + } + } else { + // Successful service call + reqCopy.Path = append(reqCopy.Path, "third-party-success") + status.SuccessCount++ + + // Reset failure count on successful calls + if status.FailureCount > 0 { + status.FailureCount-- + } + } + + reqCopy.LatencyMS += serviceLatency + } + + output = append(output, &reqCopy) + } + + // Update persistent state + props["_serviceStatus"] = status + + // Health check: service is healthy if external service is up and not excessively rate limited + // Allow some rate limiting but not too much + maxRateLimitHits := 10 // Allow up to 10 rate limit hits before considering unhealthy + healthy := status.IsUp && status.RateLimitHits < maxRateLimitHits + + return output, healthy +} + +// getProviderReliability returns the reliability percentage for different providers +func (t ThirdPartyServiceLogic) getProviderReliability(provider string) float64 { + switch provider { + case "Stripe": + return 0.999 // 99.9% uptime + case "Twilio": + return 0.998 // 99.8% uptime + case "SendGrid": + return 0.997 // 99.7% uptime + case "AWS": + return 0.9995 // 99.95% uptime + case "Google": + return 0.9999 // 99.99% uptime + case "Slack": + return 0.995 // 99.5% uptime + case "GitHub": + return 0.996 // 99.6% uptime + case "Shopify": + return 0.998 // 99.8% uptime + default: + return 0.99 // 99% uptime for generic services + } +} + +// getProviderRateLimit returns the rate limit (requests per tick) for different providers +func (t ThirdPartyServiceLogic) getProviderRateLimit(provider string) int { + switch provider { + case "Stripe": + return 100 // 100 requests per second (per tick in our sim) + case "Twilio": + return 50 // More restrictive + case "SendGrid": + return 200 // Email is typically higher volume + case "AWS": + return 1000 // Very high limits + case "Google": + return 500 // High but controlled + case "Slack": + return 30 // Very restrictive for chat APIs + case "GitHub": + return 60 // GitHub API limits + case "Shopify": + return 80 // E-commerce API limits + default: + return 100 // Default rate limit + } +} + +// getProviderLatencyVariance returns the latency variance factor for different providers +func (t ThirdPartyServiceLogic) getProviderLatencyVariance(provider string) float64 { + switch provider { + case "Stripe": + return 0.3 // Low variance, consistent performance + case "Twilio": + return 0.5 // Moderate variance + case "SendGrid": + return 0.4 // Email services are fairly consistent + case "AWS": + return 0.2 // Very consistent + case "Google": + return 0.25 // Very consistent + case "Slack": + return 0.6 // Chat services can be variable + case "GitHub": + return 0.4 // Moderate variance + case "Shopify": + return 0.5 // E-commerce can be variable under 
load + default: + return 0.5 // Default variance + } +} + +// calculateServiceLatency computes the actual latency including variance +func (t ThirdPartyServiceLogic) calculateServiceLatency(provider string, baseLatency int, variance float64) int { + // Add random variance to base latency + varianceMs := float64(baseLatency) * variance + randomVariance := (rand.Float64() - 0.5) * 2 * varianceMs // -variance to +variance + + finalLatency := float64(baseLatency) + randomVariance + + // Ensure minimum latency (can't be negative or too low) + if finalLatency < 10 { + finalLatency = 10 + } + + // Add provider-specific baseline adjustments + switch provider { + case "AWS", "Google": + // Cloud providers are typically fast + finalLatency *= 0.8 + case "Slack": + // Chat APIs can be slower + finalLatency *= 1.2 + case "Twilio": + // Telecom APIs have processing overhead + finalLatency *= 1.1 + } + + return int(finalLatency) +} diff --git a/internal/simulation/thirdpartyservice_test.go b/internal/simulation/thirdpartyservice_test.go new file mode 100644 index 0000000..8d71cfa --- /dev/null +++ b/internal/simulation/thirdpartyservice_test.go @@ -0,0 +1,382 @@ +package simulation + +import ( + "testing" +) + +func TestThirdPartyServiceLogic_BasicProcessing(t *testing.T) { + logic := ThirdPartyServiceLogic{} + + props := map[string]any{ + "provider": "Stripe", + "latency": 150.0, + } + + requests := []*Request{ + {ID: "1", Type: "POST", LatencyMS: 50, Path: []string{}}, + {ID: "2", Type: "GET", LatencyMS: 30, Path: []string{}}, + } + + output, healthy := logic.Tick(props, requests, 1) + + if !healthy { + t.Error("Expected third party service to be healthy") + } + + if len(output) != 2 { + t.Errorf("Expected 2 processed requests, got %d", len(output)) + } + + // Verify latency was added (should be around base latency with some variance) + for i, req := range output { + originalLatency := requests[i].LatencyMS + if req.LatencyMS <= originalLatency { + t.Errorf("Expected third party service latency to be added") + } + + // Check that path was updated + if len(req.Path) == 0 { + t.Error("Expected path to be updated") + } + + lastPathElement := req.Path[len(req.Path)-1] + if lastPathElement != "third-party-success" && lastPathElement != "third-party-failed" { + t.Errorf("Expected path to indicate success or failure, got %s", lastPathElement) + } + } +} + +func TestThirdPartyServiceLogic_ProviderCharacteristics(t *testing.T) { + logic := ThirdPartyServiceLogic{} + + providers := []string{"Stripe", "AWS", "Slack", "Twilio"} + + for _, provider := range providers { + t.Run(provider, func(t *testing.T) { + props := map[string]any{ + "provider": provider, + "latency": 100.0, + } + + requests := []*Request{{ID: "1", Type: "POST", LatencyMS: 0, Path: []string{}}} + + output, healthy := logic.Tick(props, requests, 1) + + if !healthy { + t.Errorf("Expected %s service to be healthy", provider) + } + + if len(output) != 1 { + t.Errorf("Expected 1 processed request for %s", provider) + } + + // Verify latency characteristics + addedLatency := output[0].LatencyMS + if addedLatency <= 0 { + t.Errorf("Expected %s to add latency", provider) + } + + // AWS and Google should be faster than Slack + if provider == "AWS" && addedLatency > 200 { + t.Errorf("Expected AWS to have lower latency, got %dms", addedLatency) + } + }) + } +} + +func TestThirdPartyServiceLogic_RateLimiting(t *testing.T) { + logic := ThirdPartyServiceLogic{} + + props := map[string]any{ + "provider": "Slack", // Has low rate limit (30 RPS) + "latency": 
100.0, + } + + // Send more requests than rate limit + requests := make([]*Request, 50) // More than Slack's 30 RPS limit + for i := range requests { + requests[i] = &Request{ID: string(rune('1' + i)), Type: "POST", LatencyMS: 0} + } + + output, healthy := logic.Tick(props, requests, 1) + + // Should only process up to rate limit + if len(output) != 30 { + t.Errorf("Expected 30 processed requests due to Slack rate limit, got %d", len(output)) + } + + // Service should still be healthy with rate limiting + if !healthy { + t.Error("Expected service to be healthy despite rate limiting") + } + + // Check that rate limit hits were recorded + status, ok := props["_serviceStatus"].(ServiceStatus) + if !ok { + t.Error("Expected service status to be recorded") + } + + if status.RateLimitHits != 1 { + t.Errorf("Expected 1 rate limit hit, got %d", status.RateLimitHits) + } +} + +func TestThirdPartyServiceLogic_ServiceFailure(t *testing.T) { + logic := ThirdPartyServiceLogic{} + + props := map[string]any{ + "provider": "Generic", + "latency": 100.0, + } + + // Set up service as already having failures + status := ServiceStatus{ + IsUp: false, + LastCheck: 0, + FailureCount: 6, + } + props["_serviceStatus"] = status + + requests := []*Request{{ID: "1", Type: "POST", LatencyMS: 50, Path: []string{}}} + + output, healthy := logic.Tick(props, requests, 1) + + if healthy { + t.Error("Expected service to be unhealthy when external service is down") + } + + if len(output) != 1 { + t.Error("Expected request to be processed even when service is down") + } + + // Should have very high latency due to timeout + if output[0].LatencyMS < 5000 { + t.Errorf("Expected high latency for service failure, got %dms", output[0].LatencyMS) + } + + // Check path indicates timeout + lastPath := output[0].Path[len(output[0].Path)-1] + if lastPath != "third-party-timeout" { + t.Errorf("Expected timeout path, got %s", lastPath) + } +} + +func TestThirdPartyServiceLogic_ServiceRecovery(t *testing.T) { + logic := ThirdPartyServiceLogic{} + + props := map[string]any{ + "provider": "Stripe", + "latency": 100.0, + } + + // Set up service as down but with old timestamp (should recover) + status := ServiceStatus{ + IsUp: false, + LastCheck: 0, // Very old timestamp + FailureCount: 3, + } + props["_serviceStatus"] = status + + requests := []*Request{{ID: "1", Type: "POST", LatencyMS: 50, Path: []string{}}} + + // Run with current tick that's more than 30 seconds later + _, healthy := logic.Tick(props, requests, 400) // 40 seconds later + + if !healthy { + t.Error("Expected service to be healthy after recovery") + } + + // Check that service recovered + updatedStatus, ok := props["_serviceStatus"].(ServiceStatus) + if !ok { + t.Error("Expected updated service status") + } + + if !updatedStatus.IsUp { + t.Error("Expected service to have recovered") + } + + if updatedStatus.FailureCount != 0 { + t.Error("Expected failure count to be reset on recovery") + } +} + +func TestThirdPartyServiceLogic_ReliabilityDifferences(t *testing.T) { + logic := ThirdPartyServiceLogic{} + + // Test different reliability levels + testCases := []struct { + provider string + expectedReliability float64 + }{ + {"AWS", 0.9995}, + {"Google", 0.9999}, + {"Stripe", 0.999}, + {"Slack", 0.995}, + {"Generic", 0.99}, + } + + for _, tc := range testCases { + reliability := logic.getProviderReliability(tc.provider) + if reliability != tc.expectedReliability { + t.Errorf("Expected %s reliability %.4f, got %.4f", + tc.provider, tc.expectedReliability, reliability) + } + } +} 
+ +func TestThirdPartyServiceLogic_RateLimitDifferences(t *testing.T) { + logic := ThirdPartyServiceLogic{} + + // Test different rate limits + testCases := []struct { + provider string + expectedLimit int + }{ + {"AWS", 1000}, + {"Stripe", 100}, + {"Slack", 30}, + {"SendGrid", 200}, + {"Twilio", 50}, + } + + for _, tc := range testCases { + rateLimit := logic.getProviderRateLimit(tc.provider) + if rateLimit != tc.expectedLimit { + t.Errorf("Expected %s rate limit %d, got %d", + tc.provider, tc.expectedLimit, rateLimit) + } + } +} + +func TestThirdPartyServiceLogic_LatencyVariance(t *testing.T) { + logic := ThirdPartyServiceLogic{} + + props := map[string]any{ + "provider": "Stripe", + "latency": 100.0, + } + + requests := []*Request{{ID: "1", Type: "POST", LatencyMS: 0, Path: []string{}}} + + latencies := []int{} + + // Run multiple times to observe variance + for i := 0; i < 10; i++ { + output, _ := logic.Tick(props, requests, i) + latencies = append(latencies, output[0].LatencyMS) + } + + // Check that we have variance (not all latencies are the same) + allSame := true + firstLatency := latencies[0] + for _, latency := range latencies[1:] { + if latency != firstLatency { + allSame = false + break + } + } + + if allSame { + t.Error("Expected latency variance, but all latencies were the same") + } + + // All latencies should be reasonable (between 50ms and 300ms for Stripe) + for _, latency := range latencies { + if latency < 50 || latency > 300 { + t.Errorf("Expected reasonable latency for Stripe, got %dms", latency) + } + } +} + +func TestThirdPartyServiceLogic_DefaultValues(t *testing.T) { + logic := ThirdPartyServiceLogic{} + + // Empty props should use defaults + props := map[string]any{} + + requests := []*Request{{ID: "1", Type: "POST", LatencyMS: 0, Path: []string{}}} + + output, healthy := logic.Tick(props, requests, 1) + + if !healthy { + t.Error("Expected service to be healthy with default values") + } + + if len(output) != 1 { + t.Error("Expected 1 processed request with defaults") + } + + // Should have reasonable default latency (around 200ms base) + if output[0].LatencyMS < 100 || output[0].LatencyMS > 400 { + t.Errorf("Expected reasonable default latency, got %dms", output[0].LatencyMS) + } +} + +func TestThirdPartyServiceLogic_SuccessCountTracking(t *testing.T) { + logic := ThirdPartyServiceLogic{} + + props := map[string]any{ + "provider": "AWS", // High reliability + "latency": 50.0, + } + + requests := []*Request{{ID: "1", Type: "POST", LatencyMS: 0, Path: []string{}}} + + // Run multiple successful requests + for i := 0; i < 5; i++ { + logic.Tick(props, requests, i) + } + + status, ok := props["_serviceStatus"].(ServiceStatus) + if !ok { + t.Error("Expected service status to be tracked") + } + + // Should have accumulated success count + if status.SuccessCount == 0 { + t.Error("Expected success count to be tracked") + } + + // Should be healthy + if !status.IsUp { + t.Error("Expected service to remain up with successful calls") + } +} + +func TestThirdPartyServiceLogic_FailureRecovery(t *testing.T) { + logic := ThirdPartyServiceLogic{} + + props := map[string]any{ + "provider": "Generic", + "latency": 100.0, + } + + // Set up service with some failures but still up + status := ServiceStatus{ + IsUp: true, + FailureCount: 3, + SuccessCount: 0, + } + props["_serviceStatus"] = status + + requests := []*Request{{ID: "1", Type: "POST", LatencyMS: 0, Path: []string{}}} + + // Simulate a successful call (with high probability for Generic service) + // We'll run this multiple 
times to ensure we get at least one success + successFound := false + for i := 0; i < 10 && !successFound; i++ { + output, _ := logic.Tick(props, requests, i) + if len(output[0].Path) > 0 && output[0].Path[len(output[0].Path)-1] == "third-party-success" { + successFound = true + } + } + + if successFound { + updatedStatus, _ := props["_serviceStatus"].(ServiceStatus) + // Failure count should have decreased + if updatedStatus.FailureCount >= 3 { + t.Error("Expected failure count to decrease after successful call") + } + } +} diff --git a/internal/simulation/webserver.go b/internal/simulation/webserver.go index c3a2d57..cc7747d 100644 --- a/internal/simulation/webserver.go +++ b/internal/simulation/webserver.go @@ -6,7 +6,7 @@ type WebServerLogic struct { } func (l WebServerLogic) Tick(props map[string]any, queue []*Request, tick int) ([]*Request, bool) { - maxRPS := int(AsFloat64(props["capacityRPS"])) + maxRPS := int(AsFloat64(props["rpsCapacity"])) toProcess := queue if len(queue) > maxRPS { diff --git a/router/handlers/simulation.go b/router/handlers/simulation.go index 8d0ffac..d5bce73 100644 --- a/router/handlers/simulation.go +++ b/router/handlers/simulation.go @@ -2,17 +2,24 @@ package handlers import ( "encoding/json" + "fmt" "net/http" "systemdesigngame/internal/design" + "systemdesigngame/internal/level" + "systemdesigngame/internal/simulation" ) type SimulationHandler struct{} type SimulationResponse struct { - Success bool `json:"success"` - Metrics map[string]interface{} `json:"metrics,omitempty"` - Timeline []interface{} `json:"timeline,omitempty"` - Error string `json:"error,omitempty"` + Success bool `json:"success"` + Metrics map[string]interface{} `json:"metrics,omitempty"` + Timeline []interface{} `json:"timeline,omitempty"` + Passed bool `json:"passed,omitempty"` + Score int `json:"score,omitempty"` + Feedback []string `json:"feedback,omitempty"` + LevelName string `json:"levelName,omitempty"` + Error string `json:"error,omitempty"` } func (h *SimulationHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) { @@ -21,22 +28,96 @@ func (h *SimulationHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) { return } - var design design.Design - if err := json.NewDecoder(r.Body).Decode(&design); err != nil { - http.Error(w, "Invalid design JSON: "+err.Error(), http.StatusBadRequest) + var requestBody struct { + Design design.Design `json:"design"` + LevelName string `json:"levelName,omitempty"` + Difficulty string `json:"difficulty,omitempty"` + } + + if err := json.NewDecoder(r.Body).Decode(&requestBody); err != nil { + // Try to decode as just design for backward compatibility + r.Body.Close() + var design design.Design + if err2 := json.NewDecoder(r.Body).Decode(&design); err2 != nil { + http.Error(w, "Invalid request JSON: "+err.Error(), http.StatusBadRequest) + return + } + requestBody.Design = design + } + + // Extract the design for processing + design := requestBody.Design + + // Run the actual simulation + engine := simulation.NewEngineFromDesign(design, 100) + if engine == nil { + response := SimulationResponse{ + Success: false, + Error: "Failed to create simulation engine - no valid components found", + } + w.Header().Set("Content-Type", "application/json") + json.NewEncoder(w).Encode(response) return } - // For now, return a mock successful response but eventually, we want to go to the results page(s) + // Set simulation parameters + engine.RPS = 50 // Default RPS - could be configurable later + + // Find entry node by analyzing topology + entryNode := 
findEntryNode(design) + + if entryNode == "" { + response := SimulationResponse{ + Success: false, + Error: "No entry point found - design must include a component with no incoming connections (webserver, microservice, load balancer, etc.)", + } + w.Header().Set("Content-Type", "application/json") + json.NewEncoder(w).Encode(response) + return + } + + engine.EntryNode = entryNode + + // Run simulation for 60 ticks (6 seconds at 100ms per tick) + snapshots := engine.Run(60, 100) + + // Calculate metrics from snapshots + metrics := calculateMetrics(snapshots) + + // Convert snapshots to interface{} for JSON serialization + timeline := make([]interface{}, len(snapshots)) + for i, snapshot := range snapshots { + timeline[i] = snapshot + } + + // Perform level validation if level info provided + var passed bool + var score int + var feedback []string + var levelName string + + if requestBody.LevelName != "" { + difficulty := level.DifficultyEasy // default + if requestBody.Difficulty != "" { + difficulty = level.Difficulty(requestBody.Difficulty) + } + + if lvl, err := level.GetLevel(requestBody.LevelName, difficulty); err == nil { + levelName = lvl.Name + passed, score, feedback = validateLevel(lvl, design, metrics) + } else { + feedback = []string{"Warning: Level not found, simulation ran without validation"} + } + } + response := SimulationResponse{ - Success: true, - Metrics: map[string]interface{}{ - "throughput": 250, - "latency_p95": 85, - "cost_monthly": 120, - "availability": 99.5, - }, - Timeline: []interface{}{}, // Will contain TickSnapshots later + Success: true, + Metrics: metrics, + Timeline: timeline, + Passed: passed, + Score: score, + Feedback: feedback, + LevelName: levelName, } w.Header().Set("Content-Type", "application/json") @@ -45,3 +126,312 @@ func (h *SimulationHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) { return } } + +// calculateMetrics computes key performance metrics from simulation snapshots +func calculateMetrics(snapshots []*simulation.TickSnapshot) map[string]interface{} { + if len(snapshots) == 0 { + return map[string]interface{}{ + "throughput": 0, + "latency_avg": 0, + "cost_monthly": 0, + "availability": 0, + } + } + + totalRequests := 0 + totalLatency := 0 + totalHealthy := 0 + totalNodes := 0 + + // Calculate aggregate metrics across all snapshots + for _, snapshot := range snapshots { + // Count total requests processed in this tick + for _, requests := range snapshot.Emitted { + totalRequests += len(requests) + for _, req := range requests { + totalLatency += req.LatencyMS + } + } + + // Count healthy vs total nodes + for _, healthy := range snapshot.NodeHealth { + totalNodes++ + if healthy { + totalHealthy++ + } + } + } + + // Calculate throughput (requests per second) + // snapshots represent 6 seconds of simulation (60 ticks * 100ms) + simulationSeconds := float64(len(snapshots)) * 0.1 // 100ms per tick + throughput := float64(totalRequests) / simulationSeconds + + // Calculate average latency + avgLatency := 0.0 + if totalRequests > 0 { + avgLatency = float64(totalLatency) / float64(totalRequests) + } + + // Calculate availability percentage + availability := 0.0 + if totalNodes > 0 { + availability = (float64(totalHealthy) / float64(totalNodes)) * 100 + } + + // Estimate monthly cost (placeholder - could be enhanced) + monthlyCost := float64(totalNodes) * 50 // $50 per node per month baseline + + return map[string]interface{}{ + "throughput": int(throughput), + "latency_avg": int(avgLatency), + "cost_monthly": int(monthlyCost), + 
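+		// Availability is reported as the percentage of per-tick node health checks that passed across the run.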
"availability": availability, + } +} + +// findEntryNode analyzes the design topology to find the best entry point +func findEntryNode(design design.Design) string { + // Build map of incoming connections + incomingCount := make(map[string]int) + + // Initialize all nodes with 0 incoming connections + for _, node := range design.Nodes { + incomingCount[node.ID] = 0 + } + + // Count incoming connections for each node + for _, conn := range design.Connections { + incomingCount[conn.Target]++ + } + + // Find nodes with no incoming connections (potential entry points) + var entryPoints []string + for nodeID, count := range incomingCount { + if count == 0 { + entryPoints = append(entryPoints, nodeID) + } + } + + // If multiple entry points exist, prefer certain types + if len(entryPoints) > 1 { + return preferredEntryPoint(design.Nodes, entryPoints) + } else if len(entryPoints) == 1 { + return entryPoints[0] + } + + return "" // No entry point found +} + +// preferredEntryPoint selects the best entry point from candidates based on component type +func preferredEntryPoint(nodes []design.Node, candidateIDs []string) string { + // Priority order for entry points (most logical first) + priority := []string{ + "webserver", + "microservice", + "loadBalancer", // Could be edge load balancer + "cdn", // Edge CDN + "data pipeline", // Data ingestion entry + "messageQueue", // For event-driven architectures + } + + // Create lookup for candidate nodes + candidates := make(map[string]design.Node) + for _, node := range nodes { + for _, id := range candidateIDs { + if node.ID == id { + candidates[id] = node + break + } + } + } + + // Find highest priority candidate + for _, nodeType := range priority { + for id, node := range candidates { + if node.Type == nodeType { + return id + } + } + } + + // If no preferred type, return first candidate + if len(candidateIDs) > 0 { + return candidateIDs[0] + } + + return "" +} + +// validateLevel checks if the design and simulation results meet level requirements +func validateLevel(lvl *level.Level, design design.Design, metrics map[string]interface{}) (bool, int, []string) { + var feedback []string + var failedRequirements []string + var passedRequirements []string + + // Extract metrics + throughput := metrics["throughput"].(int) + avgLatency := metrics["latency_avg"].(int) + availability := metrics["availability"].(float64) + monthlyCost := metrics["cost_monthly"].(int) + + // Check throughput requirement + if throughput >= lvl.TargetRPS { + passedRequirements = append(passedRequirements, "Throughput requirement met") + } else { + failedRequirements = append(failedRequirements, + fmt.Sprintf("Throughput: %d RPS (required: %d RPS)", throughput, lvl.TargetRPS)) + } + + // Check latency requirement (using avg latency as approximation for P95) + if avgLatency <= lvl.MaxP95LatencyMs { + passedRequirements = append(passedRequirements, "Latency requirement met") + } else { + failedRequirements = append(failedRequirements, + fmt.Sprintf("Latency: %dms (max allowed: %dms)", avgLatency, lvl.MaxP95LatencyMs)) + } + + // Check availability requirement + if availability >= lvl.RequiredAvailabilityPct { + passedRequirements = append(passedRequirements, "Availability requirement met") + } else { + failedRequirements = append(failedRequirements, + fmt.Sprintf("Availability: %.1f%% (required: %.1f%%)", availability, lvl.RequiredAvailabilityPct)) + } + + // Check cost requirement + if monthlyCost <= lvl.MaxMonthlyUSD { + passedRequirements = append(passedRequirements, "Cost 
requirement met") + } else { + failedRequirements = append(failedRequirements, + fmt.Sprintf("Cost: $%d/month (max allowed: $%d/month)", monthlyCost, lvl.MaxMonthlyUSD)) + } + + // Check component requirements + componentFeedback := validateComponentRequirements(lvl, design) + if len(componentFeedback.Failed) > 0 { + failedRequirements = append(failedRequirements, componentFeedback.Failed...) + } + if len(componentFeedback.Passed) > 0 { + passedRequirements = append(passedRequirements, componentFeedback.Passed...) + } + + // Determine if passed + passed := len(failedRequirements) == 0 + + // Calculate score (0-100) + score := calculateScore(len(passedRequirements), len(failedRequirements), metrics) + + // Build feedback + if passed { + feedback = append(feedback, "Level completed successfully!") + feedback = append(feedback, "") + feedback = append(feedback, passedRequirements...) + } else { + feedback = append(feedback, "Level failed - requirements not met:") + feedback = append(feedback, "") + feedback = append(feedback, failedRequirements...) + if len(passedRequirements) > 0 { + feedback = append(feedback, "") + feedback = append(feedback, "Requirements passed:") + feedback = append(feedback, passedRequirements...) + } + } + + return passed, score, feedback +} + +type ComponentValidationResult struct { + Passed []string + Failed []string +} + +// validateComponentRequirements checks mustInclude, mustNotInclude, etc. +func validateComponentRequirements(lvl *level.Level, design design.Design) ComponentValidationResult { + result := ComponentValidationResult{} + + // Build map of component types in design + componentTypes := make(map[string]int) + for _, node := range design.Nodes { + componentTypes[node.Type]++ + } + + // Check mustInclude requirements + for _, required := range lvl.MustInclude { + if count, exists := componentTypes[required]; exists && count > 0 { + result.Passed = append(result.Passed, fmt.Sprintf("Required component '%s' included", required)) + } else { + result.Failed = append(result.Failed, fmt.Sprintf("Missing required component: '%s'", required)) + } + } + + // Check mustNotInclude requirements + for _, forbidden := range lvl.MustNotInclude { + if count, exists := componentTypes[forbidden]; exists && count > 0 { + result.Failed = append(result.Failed, fmt.Sprintf("Forbidden component used: '%s'", forbidden)) + } + } + + // Check minReplicas requirements + for component, minCount := range lvl.MinReplicas { + if count, exists := componentTypes[component]; exists && count >= minCount { + result.Passed = append(result.Passed, fmt.Sprintf("Sufficient '%s' replicas (%d)", component, count)) + } else { + actualCount := 0 + if exists { + actualCount = count + } + result.Failed = append(result.Failed, + fmt.Sprintf("Insufficient '%s' replicas: %d (minimum: %d)", component, actualCount, minCount)) + } + } + + return result +} + +// calculateScore computes a score from 0-100 based on performance +func calculateScore(passedCount, failedCount int, metrics map[string]interface{}) int { + if failedCount > 0 { + // Failed level - score based on how many requirements passed + return (passedCount * 100) / (passedCount + failedCount) + } + + // Passed level - bonus points for performance + baseScore := 70 // Base score for passing + + // Performance bonuses (up to 30 points) + performanceBonus := 0 + + // Throughput bonus (higher throughput = better) + if throughput, ok := metrics["throughput"].(int); ok && throughput > 0 { + performanceBonus += min(10, throughput/100) // 1 point per 
100 RPS, max 10
+	}
+
+	// Availability bonus (higher availability = better)
+	if availability, ok := metrics["availability"].(float64); ok {
+		if availability >= 99.9 {
+			performanceBonus += 10
+		} else if availability >= 99.5 {
+			performanceBonus += 5
+		}
+	}
+
+	// Cost efficiency bonus (lower cost = better)
+	if cost, ok := metrics["cost_monthly"].(int); ok && cost > 0 {
+		if cost <= 50 {
+			performanceBonus += 10
+		} else if cost <= 100 {
+			performanceBonus += 5
+		}
+	}
+
+	return min(100, baseScore+performanceBonus)
+}
+
+// Helper function
+func min(a, b int) int {
+	if a < b {
+		return a
+	}
+	return b
+}
diff --git a/router/router.go b/router/router.go
index 0fc8d1b..1601fd5 100644
--- a/router/router.go
+++ b/router/router.go
@@ -18,6 +18,7 @@ func SetupRoutes(tmpl *template.Template) *http.ServeMux {
 	mux.Handle("/simulate", auth.RequireAuth(&handlers.SimulationHandler{}))
 	mux.HandleFunc("/login", auth.LoginHandler)
 	mux.HandleFunc("/callback", auth.CallbackHandler)
+	mux.HandleFunc("/ws", handlers.Messages)
 
 	return mux
 }
diff --git a/static/app.js b/static/app.js
index c5b815b..eef7b0c 100644
--- a/static/app.js
+++ b/static/app.js
@@ -91,9 +91,54 @@ export class CanvasApp {
 			node.y = y;
 		});
 
-		this.runButton.addEventListener('click', () => {
+		this.runButton.addEventListener('click', async () => {
 			const designData = this.exportDesign();
-			console.log(JSON.stringify(designData))
+
+			// Try to get level info from URL or page context
+			const levelInfo = this.getLevelInfo();
+
+			const requestBody = {
+				design: designData,
+				...levelInfo
+			};
+
+			console.log('Sending design to simulation:', JSON.stringify(requestBody));
+
+			// Disable button and show loading state
+			this.runButton.disabled = true;
+			this.runButton.textContent = 'Running Simulation...';
+
+			try {
+				const response = await fetch('/simulate', {
+					method: 'POST',
+					headers: {
+						'Content-Type': 'application/json',
+					},
+					body: JSON.stringify(requestBody)
+				});
+
+				if (!response.ok) {
+					throw new Error(`HTTP ${response.status}: ${response.statusText}`);
+				}
+
+				const result = await response.json();
+
+				if (result.success) {
+					console.log('Simulation successful:', result);
+					this.showResults(result);
+				} else {
+					console.error('Simulation failed:', result.error);
+					this.showError(result.error || 'Simulation failed');
+				}
+
+			} catch (error) {
+				console.error('Network error:', error);
+				this.showError('Failed to run simulation: ' + error.message);
+			} finally {
+				// Re-enable button
+				this.runButton.disabled = false;
+				this.runButton.textContent = 'Test Design';
+			}
 		});
 
 		this.canvas.addEventListener('click', () => {
@@ -267,4 +312,57 @@ export class CanvasApp {
 
 		return { nodes, connections };
 	}
+
+	getLevelInfo() {
+		// Try to extract level info from URL path like /play/url-shortener
+		const pathParts = window.location.pathname.split('/');
+		if (pathParts.length >= 3 && pathParts[1] === 'play') {
+			const levelName = decodeURIComponent(pathParts[2]);
+			return {
+				levelName: levelName,
+				difficulty: 'easy' // Default difficulty, could be enhanced later
+			};
+		}
+		return {};
+	}
+
+	showResults(result) {
+		const metrics = result.metrics;
+		let message = '';
+
+		// Level validation results
+		if (result.levelName) {
+			if (result.passed) {
+				message += `Level "${result.levelName}" PASSED!\n`;
+				message += `Score: ${result.score}/100\n\n`;
+			} else {
+				message += `Level "${result.levelName}" FAILED\n`;
+				message += `Score: ${result.score}/100\n\n`;
+			}
+
+			// Add detailed feedback
+			if (result.feedback && result.feedback.length > 0) {
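+				// feedback carries the per-requirement pass/fail lines produced by validateLevel on the server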
+				message += result.feedback.join('\n') + '\n\n';
+			}
+		} else {
+			message += `Simulation Complete!\n\n`;
+		}
+
+		// Performance metrics
+		message += `Performance Metrics:\n`;
+		message += `• Throughput: ${metrics.throughput} req/sec\n`;
+		message += `• Avg Latency: ${metrics.latency_avg}ms\n`;
+		message += `• Availability: ${metrics.availability.toFixed(1)}%\n`;
+		message += `• Monthly Cost: $${metrics.cost_monthly}\n\n`;
+		message += `Timeline: ${result.timeline.length} ticks simulated`;
+
+		alert(message);
+
+		// TODO: Later replace with redirect to results page or modal
+		console.log('Full simulation data:', result);
+	}
+
+	showError(errorMessage) {
+		alert(`Simulation Error:\n\n${errorMessage}\n\nPlease check your design and try again.`);
+	}
 }
diff --git a/static/plugins/database.js b/static/plugins/database.js
index 1b443b7..a2217d4 100644
--- a/static/plugins/database.js
+++ b/static/plugins/database.js
@@ -5,6 +5,8 @@ PluginRegistry.register('database', {
   label: 'Database',
   props: [
     { name: 'label', type: 'string', default: 'Database', group: 'label-group' },
-    { name: 'replication', type: 'number', default: 1, group: 'db-group' }
+    { name: 'replication', type: 'number', default: 1, group: 'db-group' },
+    { name: 'maxRPS', type: 'number', default: 1000, group: 'db-group' },
+    { name: 'baseLatencyMs', type: 'number', default: 10, group: 'db-group' }
   ]
 });
diff --git a/static/plugins/messageQueue.js b/static/plugins/messageQueue.js
index 8bae881..dece3ef 100644
--- a/static/plugins/messageQueue.js
+++ b/static/plugins/messageQueue.js
@@ -6,6 +6,7 @@ PluginRegistry.register('messageQueue', {
   props: [
     { name: 'label', type: 'string', default: 'MQ', group: 'label-group' },
     { name: 'queueCapacity', type: 'number', default: 10000, group: 'mq-group' },
-    { name: 'retentionSeconds', type: 'number', default: 600, group: 'mq-group' }
+    { name: 'retentionSeconds', type: 'number', default: 600, group: 'mq-group' },
+    { name: 'processingRate', type: 'number', default: 100, group: 'mq-group' }
   ]
 });
diff --git a/static/plugins/monitorAlerting.js b/static/plugins/monitorAlerting.js
index 9cdbf53..80c5e73 100644
--- a/static/plugins/monitorAlerting.js
+++ b/static/plugins/monitorAlerting.js
@@ -6,6 +6,8 @@ PluginRegistry.register('monitoring/alerting', {
   props: [
     { name: 'label', type: 'string', default: 'monitor', group: 'label-group' },
     { name: 'tool', type: 'string', default: 'Prometheus', group: 'monitor-group' },
-    { name: 'alertThreshold', type: 'number', default: 80, group: 'monitor-group' }
+    { name: 'alertMetric', type: 'string', default: 'latency', group: 'monitor-group' },
+    { name: 'thresholdValue', type: 'number', default: 80, group: 'monitor-group' },
+    { name: 'thresholdUnit', type: 'string', default: 'ms', group: 'monitor-group' }
   ]
 });