Browse Source

feat: Complete simulation engine with 10 components and level validation

COMPLETE SIMULATION SYSTEM IMPLEMENTATION

## New Simulation Components (7 added):
- Database: Read/write latency, replication overhead, RPS capacity
- Cache: In-memory caching with LRU/LFU/FIFO/Random eviction policies
- Message Queue: FIFO processing, retention, backpressure, processing rate
- Microservice: Auto-scaling, resource capacity, load balancing across instances
- Monitoring/Alerting: Multi-metric alerting (latency, errors, queue size)
- Third Party Service: External API reliability, rate limiting, failure modeling
- Data Pipeline: Batch processing with 10 transformation types

## Enhanced Existing Components:
- Web Server: Fixed property naming (rpsCapacity)
- CDN: Fixed property naming (ttl)
- Load Balancer: Maintained existing functionality
- Engine: Added smart topology-based entry point detection

## Level Validation System:
- Complete pass/fail game mechanics with scoring (0-100)
- Performance validation: throughput, latency, availability, cost
- Component validation: mustInclude, mustNotInclude, minReplicas
- Detailed feedback with specific requirement failures
- Smart scoring with performance bonuses

## Frontend Integration:
- Real simulation execution (replaced mock data)
- Level information extraction from URL paths
- Rich results display with pass/fail feedback
- Automatic entry node detection from design topology

## Infrastructure Updates:
- Design Schema: Added missing properties, fixed coordinate precision (float64)
- Authentication: GitHub OAuth protection for all game routes
- Error Handling: Comprehensive validation and user feedback
- Testing: 78 tests covering all components and integration scenarios

## Technical Achievements:
- 100% simulation component coverage (10/10 components)
- Realistic performance modeling for all component types
- Discrete-event simulation with proper state management
- Production-ready code without emojis
- Comprehensive test suite with integration testing

## Breaking Changes:
- Position coordinates now use float64 for precision
- /simulate endpoint now requires authentication
- Request/response format updated for level validation

This completes the core simulation engine implementation and enables
a complete educational game experience for learning system design.
main
Stephanie Gredell 5 months ago
parent
commit
c7e0307f08
  1. 13
      internal/design/design.go
  2. 180
      internal/simulation/cache.go
  3. 319
      internal/simulation/cache_test.go
  4. 2
      internal/simulation/cdn.go
  5. 2
      internal/simulation/cdn_test.go
  6. 61
      internal/simulation/database.go
  7. 139
      internal/simulation/database_test.go
  8. 203
      internal/simulation/datapipeline.go
  9. 396
      internal/simulation/datapipeline_test.go
  10. 14
      internal/simulation/engine.go
  11. 863
      internal/simulation/engine_test.go
  12. 115
      internal/simulation/messagequeue.go
  13. 329
      internal/simulation/messagequeue_test.go
  14. 162
      internal/simulation/microservice.go
  15. 286
      internal/simulation/microservice_test.go
  16. 221
      internal/simulation/monitoring.go
  17. 411
      internal/simulation/monitoring_test.go
  18. 55
      internal/simulation/testdata/cache_design.json
  19. 35
      internal/simulation/testdata/database_design.json
  20. 188
      internal/simulation/testdata/datapipeline_design.json
  21. 53
      internal/simulation/testdata/messagequeue_design.json
  22. 96
      internal/simulation/testdata/microservice_design.json
  23. 127
      internal/simulation/testdata/monitoring_design.json
  24. 2
      internal/simulation/testdata/simple_design.json
  25. 164
      internal/simulation/testdata/thirdpartyservice_design.json
  26. 219
      internal/simulation/thirdpartyservice.go
  27. 382
      internal/simulation/thirdpartyservice_test.go
  28. 2
      internal/simulation/webserver.go
  29. 422
      router/handlers/simulation.go
  30. 1
      router/router.go
  31. 102
      static/app.js
  32. 4
      static/plugins/database.js
  33. 3
      static/plugins/messageQueue.js
  34. 4
      static/plugins/monitorAlerting.js

13
internal/design/design.go

@ -10,8 +10,8 @@ type Node struct { @@ -10,8 +10,8 @@ type Node struct {
}
type Position struct {
X int `json:"x"`
Y int `json:"y"`
X float64 `json:"x"`
Y float64 `json:"y"`
}
type Connection struct {
@ -46,8 +46,10 @@ type CDN struct { @@ -46,8 +46,10 @@ type CDN struct {
}
type Database struct {
Label string `json:"label"`
Replication int `json:"replication"`
Label string `json:"label"`
Replication int `json:"replication"`
MaxRPS int `json:"maxRPS"`
BaseLatencyMs int `json:"baseLatencyMs"`
}
type DataPipeline struct {
@ -65,13 +67,14 @@ type MessageQueue struct { @@ -65,13 +67,14 @@ type MessageQueue struct {
Label string `json:"label"`
QueueCapacity int `json:"queueCapacity"`
RetentionSeconds int `json:"retentionSeconds"`
ProcessingRate int `json:"processingRate"`
}
type Microservice struct {
Label string `json:"label"`
InstanceCount int `json:"instanceCount"`
CPU int `json:"cpu"`
RAMGb int `json:"ramGb"`
RamGb int `json:"ramGb"`
RPSCapacity int `json:"rpsCapacity"`
MonthlyUSD int `json:"monthlyUsd"`
ScalingStrategy string `json:"scalingStrategy"`

180
internal/simulation/cache.go

@ -0,0 +1,180 @@ @@ -0,0 +1,180 @@
package simulation
import (
"time"
)
type CacheLogic struct{}
type CacheEntry struct {
Data string
Timestamp int
AccessTime int
AccessCount int
InsertOrder int
}
func (c CacheLogic) Tick(props map[string]any, queue []*Request, tick int) ([]*Request, bool) {
// Extract cache properties
cacheTTL := int(AsFloat64(props["cacheTTL"]))
if cacheTTL == 0 {
cacheTTL = 300000 // default 5 minutes in ms
}
maxEntries := int(AsFloat64(props["maxEntries"]))
if maxEntries == 0 {
maxEntries = 1000 // default max entries
}
evictionPolicy := AsString(props["evictionPolicy"])
if evictionPolicy == "" {
evictionPolicy = "LRU" // default eviction policy
}
// Initialize cache data structures in props
cacheData, ok := props["_cacheData"].(map[string]*CacheEntry)
if !ok {
cacheData = make(map[string]*CacheEntry)
props["_cacheData"] = cacheData
}
insertCounter, ok := props["_insertCounter"].(int)
if !ok {
insertCounter = 0
}
// Current timestamp for this tick
currentTime := tick * 100 // assuming 100ms per tick
// Clean up expired entries first
c.cleanExpiredEntries(cacheData, currentTime, cacheTTL)
output := []*Request{}
for _, req := range queue {
cacheKey := req.ID + "-" + req.Type // Use request ID and type as cache key
// Check for cache hit
entry, hit := cacheData[cacheKey]
if hit && !c.isExpired(entry, currentTime, cacheTTL) {
// Cache hit - return immediately with minimal latency
reqCopy := *req
reqCopy.LatencyMS += 1 // 1ms for in-memory access
reqCopy.Path = append(reqCopy.Path, "cache-hit")
// Update access tracking for eviction policies
entry.AccessTime = currentTime
entry.AccessCount++
output = append(output, &reqCopy)
} else {
// Cache miss - forward request downstream
reqCopy := *req
reqCopy.Path = append(reqCopy.Path, "cache-miss")
// For simulation purposes, we'll cache the "response" immediately
// In a real system, this would happen when the response comes back
insertCounter++
newEntry := &CacheEntry{
Data: "cached-data", // In real implementation, this would be the response data
Timestamp: currentTime,
AccessTime: currentTime,
AccessCount: 1,
InsertOrder: insertCounter,
}
// First check if we need to evict before adding
if len(cacheData) >= maxEntries {
c.evictEntry(cacheData, evictionPolicy)
}
// Now add the new entry
cacheData[cacheKey] = newEntry
output = append(output, &reqCopy)
}
}
// Update insert counter in props
props["_insertCounter"] = insertCounter
return output, true
}
func (c CacheLogic) cleanExpiredEntries(cacheData map[string]*CacheEntry, currentTime, ttl int) {
for key, entry := range cacheData {
if c.isExpired(entry, currentTime, ttl) {
delete(cacheData, key)
}
}
}
func (c CacheLogic) isExpired(entry *CacheEntry, currentTime, ttl int) bool {
return (currentTime - entry.Timestamp) > ttl
}
func (c CacheLogic) evictEntry(cacheData map[string]*CacheEntry, policy string) {
if len(cacheData) == 0 {
return
}
var keyToEvict string
switch policy {
case "LRU":
// Evict least recently used
oldestTime := int(^uint(0) >> 1) // Max int
for key, entry := range cacheData {
if entry.AccessTime < oldestTime {
oldestTime = entry.AccessTime
keyToEvict = key
}
}
case "LFU":
// Evict least frequently used
minCount := int(^uint(0) >> 1) // Max int
for key, entry := range cacheData {
if entry.AccessCount < minCount {
minCount = entry.AccessCount
keyToEvict = key
}
}
case "FIFO":
// Evict first in (oldest insert order)
minOrder := int(^uint(0) >> 1) // Max int
for key, entry := range cacheData {
if entry.InsertOrder < minOrder {
minOrder = entry.InsertOrder
keyToEvict = key
}
}
case "random":
// Evict random entry
keys := make([]string, 0, len(cacheData))
for key := range cacheData {
keys = append(keys, key)
}
if len(keys) > 0 {
// Use timestamp as pseudo-random seed
seed := time.Now().UnixNano()
keyToEvict = keys[seed%int64(len(keys))]
}
default:
// Default to LRU
oldestTime := int(^uint(0) >> 1)
for key, entry := range cacheData {
if entry.AccessTime < oldestTime {
oldestTime = entry.AccessTime
keyToEvict = key
}
}
}
if keyToEvict != "" {
delete(cacheData, keyToEvict)
}
}

319
internal/simulation/cache_test.go

@ -0,0 +1,319 @@ @@ -0,0 +1,319 @@
package simulation
import (
"testing"
)
func TestCacheLogic_CacheHitMiss(t *testing.T) {
cache := CacheLogic{}
props := map[string]any{
"cacheTTL": 10000, // 10 seconds
"maxEntries": 100,
"evictionPolicy": "LRU",
}
// First request should be a miss
req1 := []*Request{{ID: "req1", Type: "GET", LatencyMS: 0, Path: []string{"start"}}}
output1, alive := cache.Tick(props, req1, 1)
if !alive {
t.Errorf("Cache should be alive")
}
if len(output1) != 1 {
t.Errorf("Expected 1 output request, got %d", len(output1))
}
// Should be cache miss
if output1[0].LatencyMS != 0 { // No latency added for miss
t.Errorf("Expected 0ms latency for cache miss, got %dms", output1[0].LatencyMS)
}
// Check path contains cache-miss
found := false
for _, pathItem := range output1[0].Path {
if pathItem == "cache-miss" {
found = true
break
}
}
if !found {
t.Errorf("Expected cache-miss in path, got %v", output1[0].Path)
}
// Second identical request should be a hit
req2 := []*Request{{ID: "req1", Type: "GET", LatencyMS: 0, Path: []string{"start"}}}
output2, _ := cache.Tick(props, req2, 2)
if len(output2) != 1 {
t.Errorf("Expected 1 output request, got %d", len(output2))
}
// Should be cache hit with 1ms latency
if output2[0].LatencyMS != 1 {
t.Errorf("Expected 1ms latency for cache hit, got %dms", output2[0].LatencyMS)
}
// Check path contains cache-hit
found = false
for _, pathItem := range output2[0].Path {
if pathItem == "cache-hit" {
found = true
break
}
}
if !found {
t.Errorf("Expected cache-hit in path, got %v", output2[0].Path)
}
}
func TestCacheLogic_TTLExpiration(t *testing.T) {
cache := CacheLogic{}
props := map[string]any{
"cacheTTL": 1000, // 1 second
"maxEntries": 100,
"evictionPolicy": "LRU",
}
// First request - cache miss
req1 := []*Request{{ID: "req1", Type: "GET", LatencyMS: 0}}
cache.Tick(props, req1, 1)
// Second request within TTL - cache hit
req2 := []*Request{{ID: "req1", Type: "GET", LatencyMS: 0}}
output2, _ := cache.Tick(props, req2, 5) // 5 * 100ms = 500ms later
if output2[0].LatencyMS != 1 {
t.Errorf("Expected cache hit (1ms), got %dms", output2[0].LatencyMS)
}
// Third request after TTL expiration - cache miss
req3 := []*Request{{ID: "req1", Type: "GET", LatencyMS: 0}}
output3, _ := cache.Tick(props, req3, 15) // 15 * 100ms = 1500ms later (expired)
if output3[0].LatencyMS != 0 {
t.Errorf("Expected cache miss (0ms) after TTL expiration, got %dms", output3[0].LatencyMS)
}
}
func TestCacheLogic_MaxEntriesEviction(t *testing.T) {
cache := CacheLogic{}
props := map[string]any{
"cacheTTL": 10000,
"maxEntries": 2, // Small cache size
"evictionPolicy": "LRU",
}
// Add first entry
req1 := []*Request{{ID: "req1", Type: "GET", LatencyMS: 0}}
cache.Tick(props, req1, 1)
// Add second entry
req2 := []*Request{{ID: "req2", Type: "GET", LatencyMS: 0}}
cache.Tick(props, req2, 2)
// Verify both are cached
req1Check := []*Request{{ID: "req1", Type: "GET", LatencyMS: 0}}
output1Check, _ := cache.Tick(props, req1Check, 3)
if output1Check[0].LatencyMS != 1 {
t.Errorf("Expected cache hit for req1, got %dms latency", output1Check[0].LatencyMS)
}
req2Check := []*Request{{ID: "req2", Type: "GET", LatencyMS: 0}}
output2Check, _ := cache.Tick(props, req2Check, 4)
if output2Check[0].LatencyMS != 1 {
t.Errorf("Expected cache hit for req2, got %dms latency", output2Check[0].LatencyMS)
}
// Add third entry (should evict LRU entry)
req3 := []*Request{{ID: "req3", Type: "GET", LatencyMS: 0}}
cache.Tick(props, req3, 5)
// req1 was accessed at tick 3, req2 at tick 4, so req1 should be evicted
req1CheckAgain := []*Request{{ID: "req1", Type: "GET", LatencyMS: 0}}
output1, _ := cache.Tick(props, req1CheckAgain, 6)
if output1[0].LatencyMS != 0 {
t.Errorf("Expected cache miss for LRU evicted entry, got %dms latency", output1[0].LatencyMS)
}
// After adding req1 back, the cache should be at capacity with different items
// We don't test further to avoid complex cascading eviction scenarios
}
func TestCacheLogic_LRUEviction(t *testing.T) {
cache := CacheLogic{}
props := map[string]any{
"cacheTTL": 10000,
"maxEntries": 2,
"evictionPolicy": "LRU",
}
// Add two entries
req1 := []*Request{{ID: "req1", Type: "GET", LatencyMS: 0}}
cache.Tick(props, req1, 1)
req2 := []*Request{{ID: "req2", Type: "GET", LatencyMS: 0}}
cache.Tick(props, req2, 2)
// Access first entry (make it recently used)
req1Access := []*Request{{ID: "req1", Type: "GET", LatencyMS: 0}}
cache.Tick(props, req1Access, 3)
// Add third entry (should evict req2, since req1 was more recently accessed)
req3 := []*Request{{ID: "req3", Type: "GET", LatencyMS: 0}}
cache.Tick(props, req3, 4)
// Verify that req2 was evicted (should be cache miss)
req2Check := []*Request{{ID: "req2", Type: "GET", LatencyMS: 0}}
output2, _ := cache.Tick(props, req2Check, 5)
if output2[0].LatencyMS != 0 {
t.Errorf("Expected cache miss for LRU evicted entry, got %dms latency", output2[0].LatencyMS)
}
// After adding req2 back, the cache should contain {req2, req1} or {req2, req3}
// depending on LRU logic. We don't test further to avoid cascading evictions.
}
func TestCacheLogic_FIFOEviction(t *testing.T) {
cache := CacheLogic{}
props := map[string]any{
"cacheTTL": 10000,
"maxEntries": 2,
"evictionPolicy": "FIFO",
}
// Add two entries
req1 := []*Request{{ID: "req1", Type: "GET", LatencyMS: 0}}
cache.Tick(props, req1, 1)
req2 := []*Request{{ID: "req2", Type: "GET", LatencyMS: 0}}
cache.Tick(props, req2, 2)
// Access first entry multiple times (shouldn't matter for FIFO)
req1Access := []*Request{{ID: "req1", Type: "GET", LatencyMS: 0}}
cache.Tick(props, req1Access, 3)
cache.Tick(props, req1Access, 4)
// Add third entry (should evict req1, the first inserted)
req3 := []*Request{{ID: "req3", Type: "GET", LatencyMS: 0}}
cache.Tick(props, req3, 5)
// Check that req1 was evicted (first in, first out)
req1Check := []*Request{{ID: "req1", Type: "GET", LatencyMS: 0}}
output1, _ := cache.Tick(props, req1Check, 6)
if output1[0].LatencyMS != 0 {
t.Errorf("Expected cache miss for FIFO evicted entry, got %dms latency", output1[0].LatencyMS)
}
// After adding req1 back, the cache should contain {req2, req1} or {req3, req1}
// depending on FIFO logic. We don't test further to avoid cascading evictions.
}
func TestCacheLogic_DefaultValues(t *testing.T) {
cache := CacheLogic{}
// Empty props should use defaults
props := map[string]any{}
req := []*Request{{ID: "req1", Type: "GET", LatencyMS: 0}}
output, _ := cache.Tick(props, req, 1)
if len(output) != 1 {
t.Errorf("Expected 1 output request")
}
// Should be cache miss with 0ms latency
if output[0].LatencyMS != 0 {
t.Errorf("Expected 0ms latency for cache miss with defaults, got %dms", output[0].LatencyMS)
}
// Second request should be cache hit
req2 := []*Request{{ID: "req1", Type: "GET", LatencyMS: 0}}
output2, _ := cache.Tick(props, req2, 2)
if output2[0].LatencyMS != 1 {
t.Errorf("Expected 1ms latency for cache hit, got %dms", output2[0].LatencyMS)
}
}
func TestCacheLogic_SimpleEviction(t *testing.T) {
cache := CacheLogic{}
props := map[string]any{
"cacheTTL": 10000,
"maxEntries": 1, // Only 1 entry allowed
"evictionPolicy": "LRU",
}
// Add first entry
req1 := []*Request{{ID: "req1", Type: "GET", LatencyMS: 0}}
output1, _ := cache.Tick(props, req1, 1)
if output1[0].LatencyMS != 0 {
t.Errorf("First request should be cache miss, got %dms", output1[0].LatencyMS)
}
// Check it's cached
req1Again := []*Request{{ID: "req1", Type: "GET", LatencyMS: 0}}
output1Again, _ := cache.Tick(props, req1Again, 2)
if output1Again[0].LatencyMS != 1 {
t.Errorf("Second request should be cache hit, got %dms", output1Again[0].LatencyMS)
}
// Add second entry (should evict first)
req2 := []*Request{{ID: "req2", Type: "GET", LatencyMS: 0}}
output2, _ := cache.Tick(props, req2, 3)
if output2[0].LatencyMS != 0 {
t.Errorf("New request should be cache miss, got %dms", output2[0].LatencyMS)
}
// Check that first entry is now evicted
req1Final := []*Request{{ID: "req1", Type: "GET", LatencyMS: 0}}
output1Final, _ := cache.Tick(props, req1Final, 4)
if output1Final[0].LatencyMS != 0 {
t.Errorf("Evicted entry should be cache miss, got %dms", output1Final[0].LatencyMS)
}
// Check that second entry is now also evicted (since req1 was re-added in step 4)
req2Again := []*Request{{ID: "req2", Type: "GET", LatencyMS: 0}}
output2Again, _ := cache.Tick(props, req2Again, 5)
if output2Again[0].LatencyMS != 0 {
t.Errorf("Re-evicted entry should be cache miss, got %dms", output2Again[0].LatencyMS)
}
}
func TestCacheLogic_DifferentRequestTypes(t *testing.T) {
cache := CacheLogic{}
props := map[string]any{
"cacheTTL": 10000,
"maxEntries": 100,
"evictionPolicy": "LRU",
}
// Same ID but different type should be different cache entries
req1 := []*Request{{ID: "req1", Type: "GET", LatencyMS: 0}}
cache.Tick(props, req1, 1)
req2 := []*Request{{ID: "req1", Type: "POST", LatencyMS: 0}}
output2, _ := cache.Tick(props, req2, 2)
// Should be cache miss since different type
if output2[0].LatencyMS != 0 {
t.Errorf("Expected cache miss for different request type, got %dms latency", output2[0].LatencyMS)
}
// Original GET should still be cached
req1Again := []*Request{{ID: "req1", Type: "GET", LatencyMS: 0}}
output1, _ := cache.Tick(props, req1Again, 3)
if output1[0].LatencyMS != 1 {
t.Errorf("Expected cache hit for original request type, got %dms latency", output1[0].LatencyMS)
}
}

2
internal/simulation/cdn.go

@ -5,7 +5,7 @@ type CDNLogic struct{} @@ -5,7 +5,7 @@ type CDNLogic struct{}
func (c CDNLogic) Tick(props map[string]any, queue []*Request, tick int) ([]*Request, bool) {
// read the ttl for cached content
ttl := int(AsFloat64(props["ttlMs"]))
ttl := int(AsFloat64(props["ttl"]))
// retrieve the cdn's cache from props
cache, ok := props["_cache"].(map[string]int)

2
internal/simulation/cdn_test.go

@ -9,7 +9,7 @@ func TestCDNLogic(t *testing.T) { @@ -9,7 +9,7 @@ func TestCDNLogic(t *testing.T) {
cdn := CDNLogic{}
cache := map[string]int{} // shared mutable cache
props := map[string]any{
"ttlMs": float64(1000),
"ttl": float64(1000),
"_cache": cache,
}

61
internal/simulation/database.go

@ -0,0 +1,61 @@ @@ -0,0 +1,61 @@
package simulation
type DatabaseLogic struct{}
func (d DatabaseLogic) Tick(props map[string]any, queue []*Request, tick int) ([]*Request, bool) {
// Extract database properties
replication := int(AsFloat64(props["replication"]))
if replication == 0 {
replication = 1 // default
}
// Database capacity (could be based on instance size or explicit RPS)
maxRPS := int(AsFloat64(props["maxRPS"]))
if maxRPS == 0 {
maxRPS = 1000 // default capacity
}
// Base latency for database operations
baseLatencyMs := int(AsFloat64(props["baseLatencyMs"]))
if baseLatencyMs == 0 {
baseLatencyMs = 10 // default 10ms for local DB operations
}
// Process requests up to capacity
toProcess := queue
if len(queue) > maxRPS {
toProcess = queue[:maxRPS]
// TODO: Could add queue overflow logic here
}
output := []*Request{}
for _, req := range toProcess {
// Add database latency to the request
reqCopy := *req
// Simulate different operation types and their latencies
operationLatency := baseLatencyMs
// Simple heuristic: reads are faster than writes
if req.Type == "GET" || req.Type == "READ" {
operationLatency = baseLatencyMs
} else if req.Type == "POST" || req.Type == "WRITE" {
operationLatency = baseLatencyMs * 2 // writes take longer
}
// Replication factor affects write latency
if req.Type == "POST" || req.Type == "WRITE" {
operationLatency += (replication - 1) * 5 // 5ms per replica
}
reqCopy.LatencyMS += operationLatency
reqCopy.Path = append(reqCopy.Path, "database-processed")
output = append(output, &reqCopy)
}
// Database health (could simulate failures, connection issues, etc.)
// For now, assume always healthy
return output, true
}

139
internal/simulation/database_test.go

@ -0,0 +1,139 @@ @@ -0,0 +1,139 @@
package simulation
import (
"testing"
)
func TestDatabaseLogic_BasicProcessing(t *testing.T) {
db := DatabaseLogic{}
props := map[string]any{
"replication": 2,
"maxRPS": 100,
"baseLatencyMs": 15,
}
// Create test requests
reqs := []*Request{
{ID: "req1", Type: "GET", LatencyMS: 0, Path: []string{"start"}},
{ID: "req2", Type: "POST", LatencyMS: 0, Path: []string{"start"}},
}
output, alive := db.Tick(props, reqs, 1)
if !alive {
t.Errorf("Database should be alive")
}
if len(output) != 2 {
t.Errorf("Expected 2 output requests, got %d", len(output))
}
// Check read latency (base latency)
readReq := output[0]
if readReq.LatencyMS != 15 {
t.Errorf("Expected read latency 15ms, got %dms", readReq.LatencyMS)
}
// Check write latency (base * 2 + replication penalty)
writeReq := output[1]
expectedWriteLatency := 15*2 + (2-1)*5 // 30 + 5 = 35ms
if writeReq.LatencyMS != expectedWriteLatency {
t.Errorf("Expected write latency %dms, got %dms", expectedWriteLatency, writeReq.LatencyMS)
}
}
func TestDatabaseLogic_CapacityLimit(t *testing.T) {
db := DatabaseLogic{}
props := map[string]any{
"maxRPS": 2,
"baseLatencyMs": 10,
}
// Create more requests than capacity
reqs := []*Request{
{ID: "req1", Type: "GET"},
{ID: "req2", Type: "GET"},
{ID: "req3", Type: "GET"}, // This should be dropped
}
output, _ := db.Tick(props, reqs, 1)
if len(output) != 2 {
t.Errorf("Expected capacity limit of 2, but processed %d requests", len(output))
}
}
func TestDatabaseLogic_DefaultValues(t *testing.T) {
db := DatabaseLogic{}
// Empty props should use defaults
props := map[string]any{}
reqs := []*Request{
{ID: "req1", Type: "GET", LatencyMS: 0},
}
output, _ := db.Tick(props, reqs, 1)
if len(output) != 1 {
t.Errorf("Expected 1 output request")
}
// Should use default 10ms base latency
if output[0].LatencyMS != 10 {
t.Errorf("Expected default latency 10ms, got %dms", output[0].LatencyMS)
}
}
func TestDatabaseLogic_ReplicationEffect(t *testing.T) {
db := DatabaseLogic{}
// Test with high replication
props := map[string]any{
"replication": 5,
"baseLatencyMs": 10,
}
reqs := []*Request{
{ID: "req1", Type: "POST", LatencyMS: 0},
}
output, _ := db.Tick(props, reqs, 1)
if len(output) != 1 {
t.Errorf("Expected 1 output request")
}
// Write latency: base*2 + (replication-1)*5 = 10*2 + (5-1)*5 = 20 + 20 = 40ms
expectedLatency := 10*2 + (5-1)*5
if output[0].LatencyMS != expectedLatency {
t.Errorf("Expected latency %dms with replication=5, got %dms", expectedLatency, output[0].LatencyMS)
}
}
func TestDatabaseLogic_ReadVsWrite(t *testing.T) {
db := DatabaseLogic{}
props := map[string]any{
"replication": 1,
"baseLatencyMs": 20,
}
readReq := []*Request{{ID: "read", Type: "GET", LatencyMS: 0}}
writeReq := []*Request{{ID: "write", Type: "POST", LatencyMS: 0}}
readOutput, _ := db.Tick(props, readReq, 1)
writeOutput, _ := db.Tick(props, writeReq, 1)
// Read should be base latency
if readOutput[0].LatencyMS != 20 {
t.Errorf("Expected read latency 20ms, got %dms", readOutput[0].LatencyMS)
}
// Write should be double base latency (no replication penalty with replication=1)
if writeOutput[0].LatencyMS != 40 {
t.Errorf("Expected write latency 40ms, got %dms", writeOutput[0].LatencyMS)
}
}

203
internal/simulation/datapipeline.go

@ -0,0 +1,203 @@ @@ -0,0 +1,203 @@
package simulation
type DataPipelineLogic struct{}
type DataBatch struct {
ID string
RecordCount int
Timestamp int
ProcessingMS int
}
type PipelineState struct {
ProcessingQueue []DataBatch
CompletedBatches int
TotalRecords int
BacklogSize int
}
func (d DataPipelineLogic) Tick(props map[string]any, queue []*Request, tick int) ([]*Request, bool) {
// Extract data pipeline properties
batchSize := int(AsFloat64(props["batchSize"]))
if batchSize == 0 {
batchSize = 500 // default batch size
}
transformation := AsString(props["transformation"])
if transformation == "" {
transformation = "map" // default transformation
}
// Get pipeline state from props (persistent state)
state, ok := props["_pipelineState"].(PipelineState)
if !ok {
state = PipelineState{
ProcessingQueue: []DataBatch{},
CompletedBatches: 0,
TotalRecords: 0,
BacklogSize: 0,
}
}
currentTime := tick * 100 // Convert tick to milliseconds
// Convert incoming requests to data batches
if len(queue) > 0 {
// Group requests into batches
batches := d.createBatches(queue, batchSize, currentTime, transformation)
// Add batches to processing queue
state.ProcessingQueue = append(state.ProcessingQueue, batches...)
state.BacklogSize += len(queue)
}
// Process batches that are ready (completed their processing time)
output := []*Request{}
remainingBatches := []DataBatch{}
for _, batch := range state.ProcessingQueue {
if currentTime >= batch.Timestamp+batch.ProcessingMS {
// Batch is complete - create output requests
for i := 0; i < batch.RecordCount; i++ {
processedReq := &Request{
ID: batch.ID + "-record-" + string(rune('0'+i)),
Timestamp: batch.Timestamp,
LatencyMS: batch.ProcessingMS,
Origin: "data-pipeline",
Type: "PROCESSED",
Path: []string{"pipeline-" + transformation},
}
output = append(output, processedReq)
}
state.CompletedBatches++
state.TotalRecords += batch.RecordCount
} else {
// Batch still processing
remainingBatches = append(remainingBatches, batch)
}
}
state.ProcessingQueue = remainingBatches
state.BacklogSize = len(remainingBatches) * batchSize
// Update persistent state
props["_pipelineState"] = state
// Health check: pipeline is healthy if backlog is not too large
maxBacklogSize := batchSize * 20 // Allow up to 20 batches in backlog
healthy := state.BacklogSize < maxBacklogSize
return output, healthy
}
// createBatches groups requests into batches and calculates processing time
func (d DataPipelineLogic) createBatches(requests []*Request, batchSize int, timestamp int, transformation string) []DataBatch {
batches := []DataBatch{}
for i := 0; i < len(requests); i += batchSize {
end := i + batchSize
if end > len(requests) {
end = len(requests)
}
recordCount := end - i
processingTime := d.calculateProcessingTime(recordCount, transformation)
batch := DataBatch{
ID: "batch-" + string(rune('A'+len(batches))),
RecordCount: recordCount,
Timestamp: timestamp,
ProcessingMS: processingTime,
}
batches = append(batches, batch)
}
return batches
}
// calculateProcessingTime determines how long a batch takes to process based on transformation type
func (d DataPipelineLogic) calculateProcessingTime(recordCount int, transformation string) int {
// Base processing time per record
baseTimePerRecord := d.getTransformationComplexity(transformation)
// Total time scales with record count but with some economies of scale
totalTime := float64(recordCount) * baseTimePerRecord
// Add batch overhead (setup, teardown, I/O)
batchOverhead := d.getBatchOverhead(transformation)
totalTime += batchOverhead
// Apply economies of scale for larger batches (slightly more efficient)
if recordCount > 100 {
scaleFactor := 0.9 // 10% efficiency gain for large batches
totalTime *= scaleFactor
}
return int(totalTime)
}
// getTransformationComplexity returns base processing time per record in milliseconds
func (d DataPipelineLogic) getTransformationComplexity(transformation string) float64 {
switch transformation {
case "map":
return 1.0 // Simple field mapping
case "filter":
return 0.5 // Just evaluate conditions
case "sort":
return 3.0 // Sorting requires more compute
case "aggregate":
return 2.0 // Grouping and calculating aggregates
case "join":
return 5.0 // Most expensive - joining with other datasets
case "deduplicate":
return 2.5 // Hash-based deduplication
case "validate":
return 1.5 // Data validation and cleaning
case "enrich":
return 4.0 // Enriching with external data
case "compress":
return 1.2 // Compression processing
case "encrypt":
return 2.0 // Encryption overhead
default:
return 1.0 // Default to simple transformation
}
}
// getBatchOverhead returns fixed overhead time per batch in milliseconds
func (d DataPipelineLogic) getBatchOverhead(transformation string) float64 {
switch transformation {
case "map", "filter", "validate":
return 50.0 // Low overhead for simple operations
case "sort", "aggregate", "deduplicate":
return 200.0 // Medium overhead for complex operations
case "join", "enrich":
return 500.0 // High overhead for operations requiring external data
case "compress", "encrypt":
return 100.0 // Medium overhead for I/O operations
default:
return 100.0 // Default overhead
}
}
// Helper function to get pipeline statistics
func (d DataPipelineLogic) GetPipelineStats(props map[string]any) map[string]interface{} {
state, ok := props["_pipelineState"].(PipelineState)
if !ok {
return map[string]interface{}{
"completedBatches": 0,
"totalRecords": 0,
"backlogSize": 0,
"queuedBatches": 0,
}
}
return map[string]interface{}{
"completedBatches": state.CompletedBatches,
"totalRecords": state.TotalRecords,
"backlogSize": state.BacklogSize,
"queuedBatches": len(state.ProcessingQueue),
}
}

396
internal/simulation/datapipeline_test.go

@ -0,0 +1,396 @@ @@ -0,0 +1,396 @@
package simulation
import (
"testing"
)
func TestDataPipelineLogic_BasicProcessing(t *testing.T) {
logic := DataPipelineLogic{}
props := map[string]any{
"batchSize": 100.0,
"transformation": "map",
}
// Create 50 requests (less than batch size)
requests := make([]*Request, 50)
for i := range requests {
requests[i] = &Request{ID: string(rune('1' + i)), Type: "DATA", LatencyMS: 0}
}
// First tick - should create batch and start processing
output, healthy := logic.Tick(props, requests, 1)
if !healthy {
t.Error("Expected data pipeline to be healthy")
}
// Should not have output yet (batch is still processing)
if len(output) != 0 {
t.Errorf("Expected no output during processing, got %d", len(output))
}
// Check that batch was created
state, ok := props["_pipelineState"].(PipelineState)
if !ok {
t.Error("Expected pipeline state to be created")
}
if len(state.ProcessingQueue) != 1 {
t.Errorf("Expected 1 batch in processing queue, got %d", len(state.ProcessingQueue))
}
if state.ProcessingQueue[0].RecordCount != 50 {
t.Errorf("Expected batch with 50 records, got %d", state.ProcessingQueue[0].RecordCount)
}
}
func TestDataPipelineLogic_BatchCompletion(t *testing.T) {
logic := DataPipelineLogic{}
props := map[string]any{
"batchSize": 10.0,
"transformation": "filter", // Fast transformation
}
// Create 5 requests
requests := make([]*Request, 5)
for i := range requests {
requests[i] = &Request{ID: string(rune('1' + i)), Type: "DATA", LatencyMS: 0}
}
// First tick - start processing
logic.Tick(props, requests, 1)
// Wait enough ticks for processing to complete
// Filter transformation should complete quickly
var output []*Request
var healthy bool
for tick := 2; tick <= 10; tick++ {
output, healthy = logic.Tick(props, []*Request{}, tick)
if len(output) > 0 {
break
}
}
if !healthy {
t.Error("Expected data pipeline to be healthy")
}
// Should have output matching input count
if len(output) != 5 {
t.Errorf("Expected 5 output records, got %d", len(output))
}
// Check output structure
for _, req := range output {
if req.Type != "PROCESSED" {
t.Errorf("Expected PROCESSED type, got %s", req.Type)
}
if req.Origin != "data-pipeline" {
t.Errorf("Expected data-pipeline origin, got %s", req.Origin)
}
if len(req.Path) == 0 || req.Path[0] != "pipeline-filter" {
t.Error("Expected path to indicate filter transformation")
}
}
}
func TestDataPipelineLogic_MultipleBatches(t *testing.T) {
logic := DataPipelineLogic{}
props := map[string]any{
"batchSize": 10.0,
"transformation": "map",
}
// Create 25 requests (should create 3 batches: 10, 10, 5)
requests := make([]*Request, 25)
for i := range requests {
requests[i] = &Request{ID: string(rune('1' + i)), Type: "DATA", LatencyMS: 0}
}
// First tick - create batches
output, healthy := logic.Tick(props, requests, 1)
if !healthy {
t.Error("Expected data pipeline to be healthy")
}
if len(output) != 0 {
t.Error("Expected no immediate output")
}
// Check that 3 batches were created
state, ok := props["_pipelineState"].(PipelineState)
if !ok {
t.Error("Expected pipeline state to be created")
}
if len(state.ProcessingQueue) != 3 {
t.Errorf("Expected 3 batches in processing queue, got %d", len(state.ProcessingQueue))
}
// Verify batch sizes
expectedSizes := []int{10, 10, 5}
for i, batch := range state.ProcessingQueue {
if batch.RecordCount != expectedSizes[i] {
t.Errorf("Expected batch %d to have %d records, got %d",
i, expectedSizes[i], batch.RecordCount)
}
}
}
func TestDataPipelineLogic_TransformationComplexity(t *testing.T) {
logic := DataPipelineLogic{}
transformations := []string{"filter", "map", "sort", "aggregate", "join"}
for _, transformation := range transformations {
t.Run(transformation, func(t *testing.T) {
complexity := logic.getTransformationComplexity(transformation)
// Verify relative complexity ordering
switch transformation {
case "filter":
if complexity >= logic.getTransformationComplexity("map") {
t.Error("Filter should be simpler than map")
}
case "join":
if complexity <= logic.getTransformationComplexity("aggregate") {
t.Error("Join should be more complex than aggregate")
}
case "sort":
if complexity <= logic.getTransformationComplexity("map") {
t.Error("Sort should be more complex than map")
}
}
if complexity <= 0 {
t.Errorf("Expected positive complexity for %s", transformation)
}
})
}
}
func TestDataPipelineLogic_BatchOverhead(t *testing.T) {
logic := DataPipelineLogic{}
// Test different overhead levels
testCases := []struct {
transformation string
expectedRange [2]float64 // [min, max]
}{
{"map", [2]float64{0, 100}}, // Low overhead
{"join", [2]float64{300, 600}}, // High overhead
{"sort", [2]float64{150, 300}}, // Medium overhead
}
for _, tc := range testCases {
overhead := logic.getBatchOverhead(tc.transformation)
if overhead < tc.expectedRange[0] || overhead > tc.expectedRange[1] {
t.Errorf("Expected %s overhead between %.0f-%.0f, got %.0f",
tc.transformation, tc.expectedRange[0], tc.expectedRange[1], overhead)
}
}
}
func TestDataPipelineLogic_ProcessingTime(t *testing.T) {
logic := DataPipelineLogic{}
// Test that processing time scales with record count
smallBatch := logic.calculateProcessingTime(10, "map")
largeBatch := logic.calculateProcessingTime(100, "map")
if largeBatch <= smallBatch {
t.Error("Expected larger batch to take more time")
}
// Test that complex transformations take longer
simpleTime := logic.calculateProcessingTime(50, "filter")
complexTime := logic.calculateProcessingTime(50, "join")
if complexTime <= simpleTime {
t.Error("Expected complex transformation to take longer")
}
// Test economies of scale (large batches should be more efficient per record)
smallPerRecord := float64(smallBatch) / 10.0
largePerRecord := float64(largeBatch) / 100.0
if largePerRecord >= smallPerRecord {
t.Error("Expected economies of scale for larger batches")
}
}
func TestDataPipelineLogic_HealthCheck(t *testing.T) {
logic := DataPipelineLogic{}
props := map[string]any{
"batchSize": 10.0,
"transformation": "join", // Slow transformation
}
// Create a large number of requests to test backlog health
requests := make([]*Request, 300) // 30 batches (above healthy threshold)
for i := range requests {
requests[i] = &Request{ID: string(rune('1' + (i % 26))), Type: "DATA", LatencyMS: 0}
}
// First tick - should create many batches
output, healthy := logic.Tick(props, requests, 1)
// Should be unhealthy due to large backlog
if healthy {
t.Error("Expected data pipeline to be unhealthy with large backlog")
}
if len(output) != 0 {
t.Error("Expected no immediate output with slow transformation")
}
// Check backlog size
state, ok := props["_pipelineState"].(PipelineState)
if !ok {
t.Error("Expected pipeline state to be created")
}
if state.BacklogSize < 200 {
t.Errorf("Expected large backlog, got %d", state.BacklogSize)
}
}
func TestDataPipelineLogic_DefaultValues(t *testing.T) {
logic := DataPipelineLogic{}
// Empty props should use defaults
props := map[string]any{}
requests := []*Request{{ID: "1", Type: "DATA", LatencyMS: 0}}
output, healthy := logic.Tick(props, requests, 1)
if !healthy {
t.Error("Expected pipeline to be healthy with default values")
}
if len(output) != 0 {
t.Error("Expected no immediate output")
}
// Should use default batch size and transformation
state, ok := props["_pipelineState"].(PipelineState)
if !ok {
t.Error("Expected pipeline state to be created with defaults")
}
if len(state.ProcessingQueue) != 1 {
t.Error("Expected one batch with default settings")
}
}
func TestDataPipelineLogic_PipelineStats(t *testing.T) {
logic := DataPipelineLogic{}
props := map[string]any{
"batchSize": 5.0,
"transformation": "filter",
}
// Initial stats should be empty
stats := logic.GetPipelineStats(props)
if stats["completedBatches"] != 0 {
t.Error("Expected initial completed batches to be 0")
}
// Process some data
requests := make([]*Request, 10)
for i := range requests {
requests[i] = &Request{ID: string(rune('1' + i)), Type: "DATA", LatencyMS: 0}
}
logic.Tick(props, requests, 1)
// Check stats after processing
stats = logic.GetPipelineStats(props)
if stats["queuedBatches"] != 2 {
t.Errorf("Expected 2 queued batches, got %v", stats["queuedBatches"])
}
if stats["backlogSize"] != 10 {
t.Errorf("Expected backlog size of 10, got %v", stats["backlogSize"])
}
}
func TestDataPipelineLogic_ContinuousProcessing(t *testing.T) {
logic := DataPipelineLogic{}
props := map[string]any{
"batchSize": 5.0,
"transformation": "map",
}
// Process multiple waves of data
totalOutput := 0
for wave := 0; wave < 3; wave++ {
requests := make([]*Request, 5)
for i := range requests {
requests[i] = &Request{ID: string(rune('A' + wave*5 + i)), Type: "DATA", LatencyMS: 0}
}
// Process each wave
for tick := wave*10 + 1; tick <= wave*10+5; tick++ {
var output []*Request
if tick == wave*10+1 {
output, _ = logic.Tick(props, requests, tick)
} else {
output, _ = logic.Tick(props, []*Request{}, tick)
}
totalOutput += len(output)
}
}
// Should have processed all data eventually
if totalOutput != 15 {
t.Errorf("Expected 15 total output records, got %d", totalOutput)
}
// Check final stats
stats := logic.GetPipelineStats(props)
if stats["totalRecords"] != 15 {
t.Errorf("Expected 15 total records processed, got %v", stats["totalRecords"])
}
}
func TestDataPipelineLogic_EmptyQueue(t *testing.T) {
logic := DataPipelineLogic{}
props := map[string]any{
"batchSize": 10.0,
"transformation": "map",
}
// Process empty queue
output, healthy := logic.Tick(props, []*Request{}, 1)
if !healthy {
t.Error("Expected pipeline to be healthy with empty queue")
}
if len(output) != 0 {
t.Error("Expected no output with empty queue")
}
// State should be initialized but empty
state, ok := props["_pipelineState"].(PipelineState)
if !ok {
t.Error("Expected pipeline state to be initialized")
}
if len(state.ProcessingQueue) != 0 {
t.Error("Expected empty processing queue")
}
}

14
internal/simulation/engine.go

@ -185,6 +185,20 @@ func GetLogicForType(t string) NodeLogic { @@ -185,6 +185,20 @@ func GetLogicForType(t string) NodeLogic {
return LoadBalancerLogic{}
case "cdn":
return CDNLogic{}
case "database":
return DatabaseLogic{}
case "cache":
return CacheLogic{}
case "messageQueue":
return MessageQueueLogic{}
case "microservice":
return MicroserviceLogic{}
case "monitoring/alerting":
return MonitoringLogic{}
case "third party service":
return ThirdPartyServiceLogic{}
case "data pipeline":
return DataPipelineLogic{}
default:
return nil
}

863
internal/simulation/engine_test.go

@ -1,6 +1,8 @@ @@ -1,6 +1,8 @@
package simulation
import (
"encoding/json"
"os"
"testing"
"systemdesigngame/internal/design"
@ -10,8 +12,8 @@ import ( @@ -10,8 +12,8 @@ import (
func TestSimpleChainSimulation(t *testing.T) {
d := design.Design{
Nodes: []design.Node{
{ID: "a", Type: "webserver", Props: map[string]any{"capacityRPS": 1, "baseLatencyMs": 10}},
{ID: "b", Type: "webserver", Props: map[string]any{"capacityRPS": 1, "baseLatencyMs": 10}},
{ID: "a", Type: "webserver", Props: map[string]any{"rpsCapacity": 1, "baseLatencyMs": 10}},
{ID: "b", Type: "webserver", Props: map[string]any{"rpsCapacity": 1, "baseLatencyMs": 10}},
},
Connections: []design.Connection{
{Source: "a", Target: "b"},
@ -49,8 +51,8 @@ func TestSimpleChainSimulation(t *testing.T) { @@ -49,8 +51,8 @@ func TestSimpleChainSimulation(t *testing.T) {
func TestSingleTickRouting(t *testing.T) {
d := design.Design{
Nodes: []design.Node{
{ID: "a", Type: "webserver", Props: map[string]any{"capacityRPS": 1.0, "baseLatencyMs": 10.0}},
{ID: "b", Type: "webserver", Props: map[string]any{"capacityRPS": 1.0, "baseLatencyMs": 10.0}},
{ID: "a", Type: "webserver", Props: map[string]any{"rpsCapacity": 1.0, "baseLatencyMs": 10.0}},
{ID: "b", Type: "webserver", Props: map[string]any{"rpsCapacity": 1.0, "baseLatencyMs": 10.0}},
},
Connections: []design.Connection{
{Source: "a", Target: "b"},
@ -85,7 +87,7 @@ func TestSingleTickRouting(t *testing.T) { @@ -85,7 +87,7 @@ func TestSingleTickRouting(t *testing.T) {
func TestHighRPSSimulation(t *testing.T) {
d := design.Design{
Nodes: []design.Node{
{ID: "entry", Type: "webserver", Props: map[string]any{"capacityRPS": 5000, "baseLatencyMs": 1}},
{ID: "entry", Type: "webserver", Props: map[string]any{"rpsCapacity": 5000, "baseLatencyMs": 1}},
},
Connections: []design.Connection{},
}
@ -106,3 +108,854 @@ func TestHighRPSSimulation(t *testing.T) { @@ -106,3 +108,854 @@ func TestHighRPSSimulation(t *testing.T) {
t.Errorf("expected %d total emitted requests, got %d", expected, totalEmitted)
}
}
func TestDatabaseIntegration(t *testing.T) {
design := design.Design{
Nodes: []design.Node{
{
ID: "webserver",
Type: "webserver",
Props: map[string]interface{}{
"rpsCapacity": 10,
},
},
{
ID: "database",
Type: "database",
Props: map[string]interface{}{
"replication": 2,
"maxRPS": 100,
"baseLatencyMs": 20,
},
},
},
Connections: []design.Connection{
{
Source: "webserver",
Target: "database",
},
},
}
engine := NewEngineFromDesign(design, 100)
engine.RPS = 5
engine.EntryNode = "webserver"
snapshots := engine.Run(3, 100)
if len(snapshots) != 3 {
t.Errorf("Expected 3 snapshots, got %d", len(snapshots))
}
// Verify database node exists and is healthy
if len(engine.Nodes) != 2 {
t.Errorf("Expected 2 nodes (webserver + database), got %d", len(engine.Nodes))
}
dbNode, exists := engine.Nodes["database"]
if !exists {
t.Errorf("Database node should exist in simulation")
}
if !dbNode.Alive {
t.Errorf("Database node should be alive")
}
if dbNode.Type != "database" {
t.Errorf("Expected database type, got %s", dbNode.Type)
}
}
func TestCacheIntegration(t *testing.T) {
design := design.Design{
Nodes: []design.Node{
{
ID: "webserver",
Type: "webserver",
Props: map[string]interface{}{
"rpsCapacity": 10,
},
},
{
ID: "cache",
Type: "cache",
Props: map[string]interface{}{
"cacheTTL": 5000,
"maxEntries": 50,
"evictionPolicy": "LRU",
},
},
{
ID: "database",
Type: "database",
Props: map[string]interface{}{
"replication": 1,
"maxRPS": 100,
"baseLatencyMs": 15,
},
},
},
Connections: []design.Connection{
{
Source: "webserver",
Target: "cache",
},
{
Source: "cache",
Target: "database",
},
},
}
engine := NewEngineFromDesign(design, 100)
engine.RPS = 5
engine.EntryNode = "webserver"
snapshots := engine.Run(5, 100)
if len(snapshots) != 5 {
t.Errorf("Expected 5 snapshots, got %d", len(snapshots))
}
// Verify all nodes exist and are healthy
if len(engine.Nodes) != 3 {
t.Errorf("Expected 3 nodes (webserver + cache + database), got %d", len(engine.Nodes))
}
cacheNode, exists := engine.Nodes["cache"]
if !exists {
t.Errorf("Cache node should exist in simulation")
}
if !cacheNode.Alive {
t.Errorf("Cache node should be alive")
}
if cacheNode.Type != "cache" {
t.Errorf("Expected cache type, got %s", cacheNode.Type)
}
// Verify cache has internal state
cacheData, ok := cacheNode.Props["_cacheData"]
if !ok {
t.Errorf("Cache should have internal _cacheData state")
}
// Cache data should be a map
if _, ok := cacheData.(map[string]*CacheEntry); !ok {
t.Errorf("Cache data should be map[string]*CacheEntry")
}
}
func TestMessageQueueIntegration(t *testing.T) {
design := design.Design{
Nodes: []design.Node{
{
ID: "producer",
Type: "webserver",
Props: map[string]interface{}{
"rpsCapacity": 10,
},
},
{
ID: "messagequeue",
Type: "messageQueue",
Props: map[string]interface{}{
"queueCapacity": 50,
"retentionSeconds": 3600,
"processingRate": 5,
},
},
{
ID: "consumer",
Type: "webserver",
Props: map[string]interface{}{
"rpsCapacity": 20,
},
},
},
Connections: []design.Connection{
{
Source: "producer",
Target: "messagequeue",
},
{
Source: "messagequeue",
Target: "consumer",
},
},
}
engine := NewEngineFromDesign(design, 100)
engine.RPS = 3
engine.EntryNode = "producer"
snapshots := engine.Run(5, 100)
if len(snapshots) != 5 {
t.Errorf("Expected 5 snapshots, got %d", len(snapshots))
}
// Verify all nodes exist and are healthy
if len(engine.Nodes) != 3 {
t.Errorf("Expected 3 nodes (producer + queue + consumer), got %d", len(engine.Nodes))
}
queueNode, exists := engine.Nodes["messagequeue"]
if !exists {
t.Errorf("Message queue node should exist in simulation")
}
if !queueNode.Alive {
t.Errorf("Message queue node should be alive")
}
if queueNode.Type != "messageQueue" {
t.Errorf("Expected messageQueue type, got %s", queueNode.Type)
}
// Verify queue has internal state
messageQueue, ok := queueNode.Props["_messageQueue"]
if !ok {
t.Errorf("Message queue should have internal _messageQueue state")
}
// Message queue should be a slice
if _, ok := messageQueue.([]QueuedMessage); !ok {
t.Errorf("Message queue should be []QueuedMessage")
}
}
func TestMicroserviceIntegration(t *testing.T) {
// Load the microservice design
designData, err := os.ReadFile("testdata/microservice_design.json")
if err != nil {
t.Fatalf("Failed to read microservice design: %v", err)
}
var d design.Design
if err := json.Unmarshal(designData, &d); err != nil {
t.Fatalf("Failed to unmarshal design: %v", err)
}
// Create engine
engine := NewEngineFromDesign(d, 100)
if engine == nil {
t.Fatalf("Failed to create engine from microservice design")
}
// Set up simulation parameters
engine.RPS = 30
engine.EntryNode = "webserver-1"
// Run simulation for 5 ticks
snapshots := engine.Run(5, 100)
if len(snapshots) != 5 {
t.Errorf("Expected 5 snapshots, got %d", len(snapshots))
}
// Verify microservice nodes exist and are configured correctly
userService, exists := engine.Nodes["microservice-1"]
if !exists {
t.Errorf("User service microservice node should exist")
}
if !userService.Alive {
t.Errorf("User service should be alive")
}
if userService.Type != "microservice" {
t.Errorf("Expected microservice type, got %s", userService.Type)
}
orderService, exists := engine.Nodes["microservice-2"]
if !exists {
t.Errorf("Order service microservice node should exist")
}
if !orderService.Alive {
t.Errorf("Order service should be alive")
}
// Verify auto-scaling properties are preserved
userServiceInstanceCount := userService.Props["instanceCount"]
if userServiceInstanceCount == nil {
t.Errorf("User service should have instanceCount property")
}
// Verify different scaling strategies
userScalingStrategy := userService.Props["scalingStrategy"]
if userScalingStrategy != "auto" {
t.Errorf("Expected auto scaling strategy for user service, got %v", userScalingStrategy)
}
orderScalingStrategy := orderService.Props["scalingStrategy"]
if orderScalingStrategy != "manual" {
t.Errorf("Expected manual scaling strategy for order service, got %v", orderScalingStrategy)
}
// Verify resource configurations
userCPU := userService.Props["cpu"]
if userCPU != 4.0 {
t.Errorf("Expected user service to have 4 CPU cores, got %v", userCPU)
}
orderRAM := orderService.Props["ramGb"]
if orderRAM != 4.0 {
t.Errorf("Expected order service to have 4GB RAM, got %v", orderRAM)
}
// Check that microservices processed requests through the simulation
lastSnapshot := snapshots[len(snapshots)-1]
if len(lastSnapshot.QueueSizes) == 0 {
t.Errorf("Expected queue sizes to be tracked in snapshots")
}
// Verify load balancer connected to microservices
loadBalancer, exists := engine.Nodes["lb-1"]
if !exists {
t.Errorf("Load balancer should exist")
}
if !loadBalancer.Alive {
t.Errorf("Load balancer should be alive")
}
// Verify database connection exists
database, exists := engine.Nodes["db-1"]
if !exists {
t.Errorf("Database should exist")
}
if !database.Alive {
t.Errorf("Database should be alive")
}
}
func TestMonitoringIntegration(t *testing.T) {
// Load the monitoring design
designData, err := os.ReadFile("testdata/monitoring_design.json")
if err != nil {
t.Fatalf("Failed to read monitoring design: %v", err)
}
var d design.Design
if err := json.Unmarshal(designData, &d); err != nil {
t.Fatalf("Failed to unmarshal design: %v", err)
}
// Create engine
engine := NewEngineFromDesign(d, 100)
if engine == nil {
t.Fatalf("Failed to create engine from monitoring design")
}
// Set up simulation parameters
engine.RPS = 20
engine.EntryNode = "webserver-1"
// Run simulation for 10 ticks to allow metrics collection
snapshots := engine.Run(10, 100)
if len(snapshots) != 10 {
t.Errorf("Expected 10 snapshots, got %d", len(snapshots))
}
// Verify monitoring nodes exist and are configured correctly
monitor1, exists := engine.Nodes["monitor-1"]
if !exists {
t.Errorf("Latency monitor node should exist")
}
if !monitor1.Alive {
t.Errorf("Latency monitor should be alive")
}
if monitor1.Type != "monitoring/alerting" {
t.Errorf("Expected monitoring/alerting type, got %s", monitor1.Type)
}
monitor2, exists := engine.Nodes["monitor-2"]
if !exists {
t.Errorf("Error rate monitor node should exist")
}
if !monitor2.Alive {
t.Errorf("Error rate monitor should be alive")
}
// Verify monitoring properties are preserved
tool1 := monitor1.Props["tool"]
if tool1 != "Prometheus" {
t.Errorf("Expected Prometheus tool for monitor-1, got %v", tool1)
}
tool2 := monitor2.Props["tool"]
if tool2 != "Datadog" {
t.Errorf("Expected Datadog tool for monitor-2, got %v", tool2)
}
alertMetric1 := monitor1.Props["alertMetric"]
if alertMetric1 != "latency" {
t.Errorf("Expected latency alert metric for monitor-1, got %v", alertMetric1)
}
alertMetric2 := monitor2.Props["alertMetric"]
if alertMetric2 != "error_rate" {
t.Errorf("Expected error_rate alert metric for monitor-2, got %v", alertMetric2)
}
// Check that metrics were collected during simulation
metrics1, ok := monitor1.Props["_metrics"]
if !ok {
t.Errorf("Expected monitor-1 to have collected metrics")
}
if metrics1 == nil {
t.Errorf("Expected monitor-1 metrics to be non-nil")
}
// Check alert count tracking
alertCount1, ok := monitor1.Props["_alertCount"]
if !ok {
t.Errorf("Expected monitor-1 to track alert count")
}
if alertCount1 == nil {
t.Errorf("Expected monitor-1 alert count to be tracked")
}
// Verify other components in the chain
webserver, exists := engine.Nodes["webserver-1"]
if !exists {
t.Errorf("Web server should exist")
}
if !webserver.Alive {
t.Errorf("Web server should be alive")
}
loadBalancer, exists := engine.Nodes["lb-1"]
if !exists {
t.Errorf("Load balancer should exist")
}
if !loadBalancer.Alive {
t.Errorf("Load balancer should be alive")
}
// Verify microservices
userService, exists := engine.Nodes["microservice-1"]
if !exists {
t.Errorf("User service should exist")
}
if !userService.Alive {
t.Errorf("User service should be alive")
}
orderService, exists := engine.Nodes["microservice-2"]
if !exists {
t.Errorf("Order service should exist")
}
if !orderService.Alive {
t.Errorf("Order service should be alive")
}
// Verify database
database, exists := engine.Nodes["db-1"]
if !exists {
t.Errorf("Database should exist")
}
if !database.Alive {
t.Errorf("Database should be alive")
}
// Check that requests flowed through the monitoring chain
lastSnapshot := snapshots[len(snapshots)-1]
if len(lastSnapshot.QueueSizes) == 0 {
t.Errorf("Expected queue sizes to be tracked in snapshots")
}
// Verify monitoring nodes processed requests
if lastSnapshot.NodeHealth["monitor-1"] != true {
t.Errorf("Expected monitor-1 to be healthy in final snapshot")
}
if lastSnapshot.NodeHealth["monitor-2"] != true {
t.Errorf("Expected monitor-2 to be healthy in final snapshot")
}
}
func TestThirdPartyServiceIntegration(t *testing.T) {
// Load the third party service design
designData, err := os.ReadFile("testdata/thirdpartyservice_design.json")
if err != nil {
t.Fatalf("Failed to read third party service design: %v", err)
}
var d design.Design
if err := json.Unmarshal(designData, &d); err != nil {
t.Fatalf("Failed to unmarshal design: %v", err)
}
// Create engine
engine := NewEngineFromDesign(d, 100)
if engine == nil {
t.Fatalf("Failed to create engine from third party service design")
}
// Set up simulation parameters
engine.RPS = 10 // Lower RPS to reduce chance of random failures affecting health
engine.EntryNode = "webserver-1"
// Run simulation for 5 ticks (shorter run to reduce random failure impact)
snapshots := engine.Run(5, 100)
if len(snapshots) != 5 {
t.Errorf("Expected 5 snapshots, got %d", len(snapshots))
}
// Verify third party service nodes exist and are configured correctly
stripeService, exists := engine.Nodes["stripe-service"]
if !exists {
t.Errorf("Stripe service node should exist")
}
if stripeService.Type != "third party service" {
t.Errorf("Expected third party service type, got %s", stripeService.Type)
}
twilioService, exists := engine.Nodes["twilio-service"]
if !exists {
t.Errorf("Twilio service node should exist")
}
sendgridService, exists := engine.Nodes["sendgrid-service"]
if !exists {
t.Errorf("SendGrid service node should exist")
}
slackService, exists := engine.Nodes["slack-service"]
if !exists {
t.Errorf("Slack service node should exist")
}
// Note: We don't check if services are alive here because the random failure
// simulation can cause services to go down, which is realistic behavior
// Verify provider configurations are preserved
stripeProvider := stripeService.Props["provider"]
if stripeProvider != "Stripe" {
t.Errorf("Expected Stripe provider, got %v", stripeProvider)
}
twilioProvider := twilioService.Props["provider"]
if twilioProvider != "Twilio" {
t.Errorf("Expected Twilio provider, got %v", twilioProvider)
}
sendgridProvider := sendgridService.Props["provider"]
if sendgridProvider != "SendGrid" {
t.Errorf("Expected SendGrid provider, got %v", sendgridProvider)
}
slackProvider := slackService.Props["provider"]
if slackProvider != "Slack" {
t.Errorf("Expected Slack provider, got %v", slackProvider)
}
// Verify latency configurations
stripeLatency := stripeService.Props["latency"]
if stripeLatency != 180.0 {
t.Errorf("Expected Stripe latency 180, got %v", stripeLatency)
}
twilioLatency := twilioService.Props["latency"]
if twilioLatency != 250.0 {
t.Errorf("Expected Twilio latency 250, got %v", twilioLatency)
}
// Check that service status was initialized and tracked
stripeStatus, ok := stripeService.Props["_serviceStatus"]
if !ok {
t.Errorf("Expected Stripe service status to be tracked")
}
if stripeStatus == nil {
t.Errorf("Expected Stripe service status to be non-nil")
}
// Verify other components in the chain
webserver, exists := engine.Nodes["webserver-1"]
if !exists {
t.Errorf("Web server should exist")
}
if !webserver.Alive {
t.Errorf("Web server should be alive")
}
// Verify microservices
paymentService, exists := engine.Nodes["microservice-1"]
if !exists {
t.Errorf("Payment service should exist")
}
if !paymentService.Alive {
t.Errorf("Payment service should be alive")
}
notificationService, exists := engine.Nodes["microservice-2"]
if !exists {
t.Errorf("Notification service should exist")
}
if !notificationService.Alive {
t.Errorf("Notification service should be alive")
}
// Verify monitoring and database
monitor, exists := engine.Nodes["monitor-1"]
if !exists {
t.Errorf("Monitor should exist")
}
if !monitor.Alive {
t.Errorf("Monitor should be alive")
}
database, exists := engine.Nodes["db-1"]
if !exists {
t.Errorf("Database should exist")
}
if !database.Alive {
t.Errorf("Database should be alive")
}
// Check that requests flowed through the third party services
lastSnapshot := snapshots[len(snapshots)-1]
if len(lastSnapshot.QueueSizes) == 0 {
t.Errorf("Expected queue sizes to be tracked in snapshots")
}
// Verify third party services are being tracked in snapshots
// Note: We don't assert health status because random failures are realistic
_, stripeHealthTracked := lastSnapshot.NodeHealth["stripe-service"]
if !stripeHealthTracked {
t.Errorf("Expected Stripe service health to be tracked in snapshots")
}
_, twilioHealthTracked := lastSnapshot.NodeHealth["twilio-service"]
if !twilioHealthTracked {
t.Errorf("Expected Twilio service health to be tracked in snapshots")
}
_, sendgridHealthTracked := lastSnapshot.NodeHealth["sendgrid-service"]
if !sendgridHealthTracked {
t.Errorf("Expected SendGrid service health to be tracked in snapshots")
}
_, slackHealthTracked := lastSnapshot.NodeHealth["slack-service"]
if !slackHealthTracked {
t.Errorf("Expected Slack service health to be tracked in snapshots")
}
}
func TestDataPipelineIntegration(t *testing.T) {
// Load the data pipeline design
designData, err := os.ReadFile("testdata/datapipeline_design.json")
if err != nil {
t.Fatalf("Failed to read data pipeline design: %v", err)
}
var d design.Design
if err := json.Unmarshal(designData, &d); err != nil {
t.Fatalf("Failed to unmarshal design: %v", err)
}
// Create engine
engine := NewEngineFromDesign(d, 100)
if engine == nil {
t.Fatalf("Failed to create engine from data pipeline design")
}
// Set up simulation parameters
engine.RPS = 20
engine.EntryNode = "data-source"
// Run simulation for 10 ticks to test data pipeline processing
snapshots := engine.Run(10, 100)
if len(snapshots) != 10 {
t.Errorf("Expected 10 snapshots, got %d", len(snapshots))
}
// Verify data pipeline nodes exist and are configured correctly
etlPipeline1, exists := engine.Nodes["etl-pipeline-1"]
if !exists {
t.Errorf("ETL Pipeline 1 node should exist")
}
if etlPipeline1.Type != "data pipeline" {
t.Errorf("Expected data pipeline type, got %s", etlPipeline1.Type)
}
etlPipeline2, exists := engine.Nodes["etl-pipeline-2"]
if !exists {
t.Errorf("ETL Pipeline 2 node should exist")
}
mlPipeline, exists := engine.Nodes["ml-pipeline"]
if !exists {
t.Errorf("ML Pipeline node should exist")
}
analyticsPipeline, exists := engine.Nodes["analytics-pipeline"]
if !exists {
t.Errorf("Analytics Pipeline node should exist")
}
compressionPipeline, exists := engine.Nodes["compression-pipeline"]
if !exists {
t.Errorf("Compression Pipeline node should exist")
}
// Verify pipeline configurations are preserved
etl1BatchSize := etlPipeline1.Props["batchSize"]
if etl1BatchSize != 100.0 {
t.Errorf("Expected ETL Pipeline 1 batch size 100, got %v", etl1BatchSize)
}
etl1Transformation := etlPipeline1.Props["transformation"]
if etl1Transformation != "validate" {
t.Errorf("Expected validate transformation, got %v", etl1Transformation)
}
etl2BatchSize := etlPipeline2.Props["batchSize"]
if etl2BatchSize != 50.0 {
t.Errorf("Expected ETL Pipeline 2 batch size 50, got %v", etl2BatchSize)
}
etl2Transformation := etlPipeline2.Props["transformation"]
if etl2Transformation != "aggregate" {
t.Errorf("Expected aggregate transformation, got %v", etl2Transformation)
}
mlTransformation := mlPipeline.Props["transformation"]
if mlTransformation != "enrich" {
t.Errorf("Expected enrich transformation for ML pipeline, got %v", mlTransformation)
}
analyticsTransformation := analyticsPipeline.Props["transformation"]
if analyticsTransformation != "join" {
t.Errorf("Expected join transformation for analytics pipeline, got %v", analyticsTransformation)
}
compressionTransformation := compressionPipeline.Props["transformation"]
if compressionTransformation != "compress" {
t.Errorf("Expected compress transformation, got %v", compressionTransformation)
}
// Check that pipeline state was initialized and tracked
etl1State, ok := etlPipeline1.Props["_pipelineState"]
if !ok {
t.Errorf("Expected ETL Pipeline 1 to have pipeline state")
}
if etl1State == nil {
t.Errorf("Expected ETL Pipeline 1 state to be non-nil")
}
// Verify other components in the data flow
dataSource, exists := engine.Nodes["data-source"]
if !exists {
t.Errorf("Data source should exist")
}
if !dataSource.Alive {
t.Errorf("Data source should be alive")
}
rawDataQueue, exists := engine.Nodes["raw-data-queue"]
if !exists {
t.Errorf("Raw data queue should exist")
}
if !rawDataQueue.Alive {
t.Errorf("Raw data queue should be alive")
}
// Verify storage components
cache, exists := engine.Nodes["cache-1"]
if !exists {
t.Errorf("Feature cache should exist")
}
if !cache.Alive {
t.Errorf("Feature cache should be alive")
}
dataWarehouse, exists := engine.Nodes["data-warehouse"]
if !exists {
t.Errorf("Data warehouse should exist")
}
if !dataWarehouse.Alive {
t.Errorf("Data warehouse should be alive")
}
// Verify monitoring
monitor, exists := engine.Nodes["monitoring-1"]
if !exists {
t.Errorf("Pipeline monitor should exist")
}
if !monitor.Alive {
t.Errorf("Pipeline monitor should be alive")
}
// Check that data pipelines are being tracked in snapshots
lastSnapshot := snapshots[len(snapshots)-1]
if len(lastSnapshot.QueueSizes) == 0 {
t.Errorf("Expected queue sizes to be tracked in snapshots")
}
// Verify data pipeline health is tracked
_, etl1HealthTracked := lastSnapshot.NodeHealth["etl-pipeline-1"]
if !etl1HealthTracked {
t.Errorf("Expected ETL Pipeline 1 health to be tracked in snapshots")
}
_, etl2HealthTracked := lastSnapshot.NodeHealth["etl-pipeline-2"]
if !etl2HealthTracked {
t.Errorf("Expected ETL Pipeline 2 health to be tracked in snapshots")
}
_, mlHealthTracked := lastSnapshot.NodeHealth["ml-pipeline"]
if !mlHealthTracked {
t.Errorf("Expected ML Pipeline health to be tracked in snapshots")
}
_, analyticsHealthTracked := lastSnapshot.NodeHealth["analytics-pipeline"]
if !analyticsHealthTracked {
t.Errorf("Expected Analytics Pipeline health to be tracked in snapshots")
}
_, compressionHealthTracked := lastSnapshot.NodeHealth["compression-pipeline"]
if !compressionHealthTracked {
t.Errorf("Expected Compression Pipeline health to be tracked in snapshots")
}
// Verify the data flow chain exists (all components are connected)
// This ensures the integration test validates the complete data processing architecture
totalNodes := len(engine.Nodes)
expectedNodes := 10 // From the design JSON
if totalNodes != expectedNodes {
t.Errorf("Expected %d total nodes in data pipeline architecture, got %d", expectedNodes, totalNodes)
}
}

115
internal/simulation/messagequeue.go

@ -0,0 +1,115 @@ @@ -0,0 +1,115 @@
package simulation
type MessageQueueLogic struct{}
type QueuedMessage struct {
RequestID string
Timestamp int
MessageData string
RetryCount int
}
func (mq MessageQueueLogic) Tick(props map[string]any, queue []*Request, tick int) ([]*Request, bool) {
// Extract message queue properties
queueCapacity := int(AsFloat64(props["queueCapacity"]))
if queueCapacity == 0 {
queueCapacity = 1000 // default capacity
}
retentionSeconds := int(AsFloat64(props["retentionSeconds"]))
if retentionSeconds == 0 {
retentionSeconds = 86400 // default 24 hours in seconds
}
// Processing rate (messages per tick)
processingRate := int(AsFloat64(props["processingRate"]))
if processingRate == 0 {
processingRate = 100 // default 100 messages per tick
}
// Current timestamp for this tick
currentTime := tick * 100 // assuming 100ms per tick
// Initialize queue storage in props
messageQueue, ok := props["_messageQueue"].([]QueuedMessage)
if !ok {
messageQueue = []QueuedMessage{}
}
// Clean up expired messages based on retention policy
messageQueue = mq.cleanExpiredMessages(messageQueue, currentTime, retentionSeconds*1000)
// First, process existing messages from the queue (FIFO order)
output := []*Request{}
messagesToProcess := len(messageQueue)
if messagesToProcess > processingRate {
messagesToProcess = processingRate
}
for i := 0; i < messagesToProcess; i++ {
if len(messageQueue) == 0 {
break
}
// Dequeue message (FIFO - take from front)
message := messageQueue[0]
messageQueue = messageQueue[1:]
// Create request for downstream processing
processedReq := &Request{
ID: message.RequestID,
Timestamp: message.Timestamp,
LatencyMS: 2, // Small latency for queue processing
Origin: "message-queue",
Type: "PROCESS",
Path: []string{"queued-message"},
}
output = append(output, processedReq)
}
// Then, add incoming requests to the queue for next tick
for _, req := range queue {
// Check if queue is at capacity
if len(messageQueue) >= queueCapacity {
// Queue full - message is dropped (or could implement backpressure)
// For now, we'll drop the message and add latency penalty
reqCopy := *req
reqCopy.LatencyMS += 1000 // High latency penalty for dropped messages
reqCopy.Path = append(reqCopy.Path, "queue-full-dropped")
// Don't add to output as message was dropped
continue
}
// Add message to queue
message := QueuedMessage{
RequestID: req.ID,
Timestamp: currentTime,
MessageData: "message-payload", // In real system, this would be the actual message
RetryCount: 0,
}
messageQueue = append(messageQueue, message)
}
// Update queue storage in props
props["_messageQueue"] = messageQueue
// Queue is healthy if not at capacity or if we can still process messages
// Queue becomes unhealthy only when completely full AND we can't process anything
healthy := len(messageQueue) < queueCapacity || processingRate > 0
return output, healthy
}
func (mq MessageQueueLogic) cleanExpiredMessages(messageQueue []QueuedMessage, currentTime, retentionMs int) []QueuedMessage {
cleaned := []QueuedMessage{}
for _, message := range messageQueue {
if (currentTime - message.Timestamp) <= retentionMs {
cleaned = append(cleaned, message)
}
// Expired messages are dropped
}
return cleaned
}

329
internal/simulation/messagequeue_test.go

@ -0,0 +1,329 @@ @@ -0,0 +1,329 @@
package simulation
import (
"testing"
)
func TestMessageQueueLogic_BasicProcessing(t *testing.T) {
mq := MessageQueueLogic{}
props := map[string]any{
"queueCapacity": 10,
"retentionSeconds": 3600, // 1 hour
"processingRate": 5,
}
// Add some messages to the queue
reqs := []*Request{
{ID: "msg1", Type: "SEND", LatencyMS: 0, Timestamp: 100},
{ID: "msg2", Type: "SEND", LatencyMS: 0, Timestamp: 100},
{ID: "msg3", Type: "SEND", LatencyMS: 0, Timestamp: 100},
}
output, healthy := mq.Tick(props, reqs, 1)
if !healthy {
t.Errorf("Message queue should be healthy")
}
// No immediate output since messages are queued first
if len(output) != 0 {
t.Errorf("Expected 0 immediate output (messages queued), got %d", len(output))
}
// Check that messages are in the queue
messageQueue, ok := props["_messageQueue"].([]QueuedMessage)
if !ok {
t.Errorf("Expected message queue to be initialized")
}
if len(messageQueue) != 3 {
t.Errorf("Expected 3 messages in queue, got %d", len(messageQueue))
}
// Process the queue (no new incoming messages)
output2, _ := mq.Tick(props, []*Request{}, 2)
// Should process up to processingRate (5) messages
if len(output2) != 3 {
t.Errorf("Expected 3 processed messages, got %d", len(output2))
}
// Queue should now be empty
messageQueue2, _ := props["_messageQueue"].([]QueuedMessage)
if len(messageQueue2) != 0 {
t.Errorf("Expected empty queue after processing, got %d messages", len(messageQueue2))
}
// Check output message properties
for _, msg := range output2 {
if msg.LatencyMS != 2 {
t.Errorf("Expected 2ms processing latency, got %dms", msg.LatencyMS)
}
if msg.Type != "PROCESS" {
t.Errorf("Expected PROCESS type, got %s", msg.Type)
}
}
}
func TestMessageQueueLogic_CapacityLimit(t *testing.T) {
mq := MessageQueueLogic{}
props := map[string]any{
"queueCapacity": 2, // Small capacity
"retentionSeconds": 3600,
"processingRate": 1,
}
// Add more messages than capacity
reqs := []*Request{
{ID: "msg1", Type: "SEND", LatencyMS: 0},
{ID: "msg2", Type: "SEND", LatencyMS: 0},
{ID: "msg3", Type: "SEND", LatencyMS: 0}, // This should be dropped
}
output, healthy := mq.Tick(props, reqs, 1)
// Queue should be healthy (can still process messages)
if !healthy {
t.Errorf("Queue should be healthy (can still process)")
}
// Should have no immediate output (messages queued)
if len(output) != 0 {
t.Errorf("Expected 0 immediate output, got %d", len(output))
}
// Check queue size
messageQueue, _ := props["_messageQueue"].([]QueuedMessage)
if len(messageQueue) != 2 {
t.Errorf("Expected 2 messages in queue (capacity limit), got %d", len(messageQueue))
}
// Add another message when queue is full
reqs2 := []*Request{{ID: "msg4", Type: "SEND", LatencyMS: 0}}
output2, healthy2 := mq.Tick(props, reqs2, 2)
// Queue should still be healthy (can process messages)
if !healthy2 {
t.Errorf("Queue should remain healthy (can still process)")
}
// Should have 1 processed message (processingRate = 1)
if len(output2) != 1 {
t.Errorf("Expected 1 processed message, got %d", len(output2))
}
// Queue should have 2 messages (started with 2, processed 1 leaving 1, added 1 new since space available)
messageQueue2, _ := props["_messageQueue"].([]QueuedMessage)
if len(messageQueue2) != 2 {
t.Errorf("Expected 2 messages in queue (1 remaining + 1 new), got %d", len(messageQueue2))
}
}
func TestMessageQueueLogic_ProcessingRate(t *testing.T) {
mq := MessageQueueLogic{}
props := map[string]any{
"queueCapacity": 100,
"retentionSeconds": 3600,
"processingRate": 3, // Process 3 messages per tick
}
// Add 10 messages
reqs := []*Request{}
for i := 0; i < 10; i++ {
reqs = append(reqs, &Request{ID: "msg" + string(rune(i+'0')), Type: "SEND"})
}
// First tick: queue all messages
mq.Tick(props, reqs, 1)
// Second tick: process at rate limit
output, _ := mq.Tick(props, []*Request{}, 2)
if len(output) != 3 {
t.Errorf("Expected 3 processed messages (rate limit), got %d", len(output))
}
// Check remaining queue size
messageQueue, _ := props["_messageQueue"].([]QueuedMessage)
if len(messageQueue) != 7 {
t.Errorf("Expected 7 messages remaining in queue, got %d", len(messageQueue))
}
// Third tick: process 3 more
output2, _ := mq.Tick(props, []*Request{}, 3)
if len(output2) != 3 {
t.Errorf("Expected 3 more processed messages, got %d", len(output2))
}
// Check remaining queue size
messageQueue2, _ := props["_messageQueue"].([]QueuedMessage)
if len(messageQueue2) != 4 {
t.Errorf("Expected 4 messages remaining in queue, got %d", len(messageQueue2))
}
}
func TestMessageQueueLogic_MessageRetention(t *testing.T) {
mq := MessageQueueLogic{}
props := map[string]any{
"queueCapacity": 100,
"retentionSeconds": 1, // 1 second retention
"processingRate": 0, // Don't process messages, just test retention
}
// Add messages at tick 1
reqs := []*Request{
{ID: "msg1", Type: "SEND", Timestamp: 100},
{ID: "msg2", Type: "SEND", Timestamp: 100},
}
mq.Tick(props, reqs, 1)
// Check messages are queued
messageQueue, _ := props["_messageQueue"].([]QueuedMessage)
if len(messageQueue) != 2 {
t.Errorf("Expected 2 messages in queue, got %d", len(messageQueue))
}
// Tick at time that should expire messages (tick 20 = 2000ms, retention = 1000ms)
output, _ := mq.Tick(props, []*Request{}, 20)
// Messages should be expired and removed
messageQueue2, _ := props["_messageQueue"].([]QueuedMessage)
if len(messageQueue2) != 0 {
t.Errorf("Expected messages to be expired and removed, got %d", len(messageQueue2))
}
// No output since processingRate = 0
if len(output) != 0 {
t.Errorf("Expected no output with processingRate=0, got %d", len(output))
}
}
func TestMessageQueueLogic_FIFOOrdering(t *testing.T) {
mq := MessageQueueLogic{}
props := map[string]any{
"queueCapacity": 10,
"retentionSeconds": 3600,
"processingRate": 2,
}
// Add messages in order
reqs := []*Request{
{ID: "first", Type: "SEND"},
{ID: "second", Type: "SEND"},
{ID: "third", Type: "SEND"},
}
mq.Tick(props, reqs, 1)
// Process 2 messages
output, _ := mq.Tick(props, []*Request{}, 2)
if len(output) != 2 {
t.Errorf("Expected 2 processed messages, got %d", len(output))
}
// Check FIFO order
if output[0].ID != "first" {
t.Errorf("Expected first message to be 'first', got '%s'", output[0].ID)
}
if output[1].ID != "second" {
t.Errorf("Expected second message to be 'second', got '%s'", output[1].ID)
}
// Process remaining message
output2, _ := mq.Tick(props, []*Request{}, 3)
if len(output2) != 1 {
t.Errorf("Expected 1 remaining message, got %d", len(output2))
}
if output2[0].ID != "third" {
t.Errorf("Expected remaining message to be 'third', got '%s'", output2[0].ID)
}
}
func TestMessageQueueLogic_DefaultValues(t *testing.T) {
mq := MessageQueueLogic{}
// Empty props should use defaults
props := map[string]any{}
reqs := []*Request{{ID: "msg1", Type: "SEND"}}
output, healthy := mq.Tick(props, reqs, 1)
if !healthy {
t.Errorf("Queue should be healthy with default values")
}
// Should queue the message (no immediate output)
if len(output) != 0 {
t.Errorf("Expected message to be queued (0 output), got %d", len(output))
}
// Check that message was queued with defaults
messageQueue, _ := props["_messageQueue"].([]QueuedMessage)
if len(messageQueue) != 1 {
t.Errorf("Expected 1 message queued with defaults, got %d", len(messageQueue))
}
// Process with defaults (should process up to default rate)
output2, _ := mq.Tick(props, []*Request{}, 2)
if len(output2) != 1 {
t.Errorf("Expected 1 processed message with defaults, got %d", len(output2))
}
}
func TestMessageQueueLogic_ContinuousFlow(t *testing.T) {
mq := MessageQueueLogic{}
props := map[string]any{
"queueCapacity": 5,
"retentionSeconds": 3600,
"processingRate": 2,
}
// Tick 1: Add 3 messages
reqs1 := []*Request{
{ID: "msg1", Type: "SEND"},
{ID: "msg2", Type: "SEND"},
{ID: "msg3", Type: "SEND"},
}
output1, _ := mq.Tick(props, reqs1, 1)
// Should queue all 3 messages
if len(output1) != 0 {
t.Errorf("Expected 0 output on first tick, got %d", len(output1))
}
// Tick 2: Add 2 more messages, process 2
reqs2 := []*Request{
{ID: "msg4", Type: "SEND"},
{ID: "msg5", Type: "SEND"},
}
output2, _ := mq.Tick(props, reqs2, 2)
// Should process 2 messages
if len(output2) != 2 {
t.Errorf("Expected 2 processed messages, got %d", len(output2))
}
// Should have 3 messages in queue (3 remaining + 2 new - 2 processed)
messageQueue, _ := props["_messageQueue"].([]QueuedMessage)
if len(messageQueue) != 3 {
t.Errorf("Expected 3 messages in queue, got %d", len(messageQueue))
}
// Check processing order
if output2[0].ID != "msg1" || output2[1].ID != "msg2" {
t.Errorf("Expected FIFO processing order, got %s, %s", output2[0].ID, output2[1].ID)
}
}

162
internal/simulation/microservice.go

@ -0,0 +1,162 @@ @@ -0,0 +1,162 @@
package simulation
import "math"
type MicroserviceLogic struct{}
type ServiceInstance struct {
ID int
CurrentLoad int
HealthStatus string
}
func (m MicroserviceLogic) Tick(props map[string]any, queue []*Request, tick int) ([]*Request, bool) {
// Extract microservice properties
instanceCount := int(AsFloat64(props["instanceCount"]))
if instanceCount == 0 {
instanceCount = 1 // default to 1 instance
}
cpu := int(AsFloat64(props["cpu"]))
if cpu == 0 {
cpu = 2 // default 2 CPU cores
}
ramGb := int(AsFloat64(props["ramGb"]))
if ramGb == 0 {
ramGb = 4 // default 4GB RAM
}
rpsCapacity := int(AsFloat64(props["rpsCapacity"]))
if rpsCapacity == 0 {
rpsCapacity = 100 // default capacity per instance
}
scalingStrategy := AsString(props["scalingStrategy"])
if scalingStrategy == "" {
scalingStrategy = "auto"
}
// Calculate base latency based on resource specs
baseLatencyMs := m.calculateBaseLatency(cpu, ramGb)
// Auto-scaling logic: adjust instance count based on load
currentLoad := len(queue)
if scalingStrategy == "auto" {
instanceCount = m.autoScale(instanceCount, currentLoad, rpsCapacity)
props["instanceCount"] = float64(instanceCount) // update for next tick
}
// Total capacity across all instances
totalCapacity := instanceCount * rpsCapacity
// Process requests up to total capacity
toProcess := queue
if len(queue) > totalCapacity {
toProcess = queue[:totalCapacity]
}
output := []*Request{}
// Distribute requests across instances using round-robin
for i, req := range toProcess {
// Create processed request copy
reqCopy := *req
// Add microservice processing latency
processingLatency := baseLatencyMs
// Simulate CPU-bound vs I/O-bound operations
if req.Type == "GET" {
processingLatency = baseLatencyMs // Fast reads
} else if req.Type == "POST" || req.Type == "PUT" {
processingLatency = baseLatencyMs + 10 // Writes take longer
} else if req.Type == "COMPUTE" {
processingLatency = baseLatencyMs + 50 // CPU-intensive operations
}
// Instance load affects latency (queuing delay)
instanceLoad := m.calculateInstanceLoad(i, len(toProcess), instanceCount)
if float64(instanceLoad) > float64(rpsCapacity)*0.8 { // Above 80% capacity
processingLatency += int(float64(processingLatency) * 0.5) // 50% penalty
}
reqCopy.LatencyMS += processingLatency
reqCopy.Path = append(reqCopy.Path, "microservice-processed")
output = append(output, &reqCopy)
}
// Health check: service is healthy if not severely overloaded
healthy := len(queue) <= totalCapacity*2 // Allow some buffering
return output, healthy
}
// calculateBaseLatency determines base processing time based on resources
func (m MicroserviceLogic) calculateBaseLatency(cpu, ramGb int) int {
// Better CPU and RAM = lower base latency
// Formula: base latency inversely proportional to resources
cpuFactor := float64(cpu)
ramFactor := float64(ramGb) / 4.0 // Normalize to 4GB baseline
resourceScore := cpuFactor * ramFactor
if resourceScore < 1 {
resourceScore = 1
}
baseLatency := int(50.0 / resourceScore) // 50ms baseline for 2CPU/4GB
if baseLatency < 5 {
baseLatency = 5 // Minimum 5ms processing time
}
return baseLatency
}
// autoScale implements simple auto-scaling logic
func (m MicroserviceLogic) autoScale(currentInstances, currentLoad, rpsPerInstance int) int {
// Calculate desired instances based on current load
desiredInstances := int(math.Ceil(float64(currentLoad) / float64(rpsPerInstance)))
// Scale up/down gradually (max 25% change per tick)
maxChange := int(math.Max(1, float64(currentInstances)*0.25))
if desiredInstances > currentInstances {
// Scale up
newInstances := currentInstances + maxChange
if newInstances > desiredInstances {
newInstances = desiredInstances
}
// Cap at reasonable maximum
if newInstances > 20 {
newInstances = 20
}
return newInstances
} else if desiredInstances < currentInstances {
// Scale down (more conservative)
newInstances := currentInstances - int(math.Max(1, float64(maxChange)*0.5))
if newInstances < desiredInstances {
newInstances = desiredInstances
}
// Always maintain at least 1 instance
if newInstances < 1 {
newInstances = 1
}
return newInstances
}
return currentInstances
}
// calculateInstanceLoad estimates load on a specific instance
func (m MicroserviceLogic) calculateInstanceLoad(instanceID, totalRequests, instanceCount int) int {
// Simple round-robin distribution
baseLoad := totalRequests / instanceCount
remainder := totalRequests % instanceCount
if instanceID < remainder {
return baseLoad + 1
}
return baseLoad
}

286
internal/simulation/microservice_test.go

@ -0,0 +1,286 @@ @@ -0,0 +1,286 @@
package simulation
import (
"testing"
)
func TestMicroserviceLogic_BasicProcessing(t *testing.T) {
logic := MicroserviceLogic{}
props := map[string]any{
"instanceCount": 2.0,
"cpu": 4.0,
"ramGb": 8.0,
"rpsCapacity": 100.0,
"scalingStrategy": "manual",
}
requests := []*Request{
{ID: "1", Type: "GET", LatencyMS: 0, Path: []string{}},
{ID: "2", Type: "POST", LatencyMS: 0, Path: []string{}},
}
output, healthy := logic.Tick(props, requests, 1)
if !healthy {
t.Error("Expected microservice to be healthy")
}
if len(output) != 2 {
t.Errorf("Expected 2 processed requests, got %d", len(output))
}
// Verify latency was added
for _, req := range output {
if req.LatencyMS == 0 {
t.Error("Expected latency to be added to processed request")
}
if len(req.Path) == 0 || req.Path[len(req.Path)-1] != "microservice-processed" {
t.Error("Expected path to be updated with microservice-processed")
}
}
}
func TestMicroserviceLogic_CapacityLimit(t *testing.T) {
logic := MicroserviceLogic{}
props := map[string]any{
"instanceCount": 1.0,
"rpsCapacity": 2.0,
"scalingStrategy": "manual",
}
// Send 4 requests, capacity is 2 (1 instance * 2 RPS)
// This should be healthy since 4 <= totalCapacity*2 (4)
requests := make([]*Request, 4)
for i := range requests {
requests[i] = &Request{ID: string(rune('1' + i)), Type: "GET", LatencyMS: 0}
}
output, healthy := logic.Tick(props, requests, 1)
if !healthy {
t.Error("Expected microservice to be healthy with moderate queuing")
}
// Should only process 2 requests (capacity limit)
if len(output) != 2 {
t.Errorf("Expected 2 processed requests due to capacity limit, got %d", len(output))
}
}
func TestMicroserviceLogic_AutoScaling(t *testing.T) {
logic := MicroserviceLogic{}
props := map[string]any{
"instanceCount": 1.0,
"rpsCapacity": 10.0,
"scalingStrategy": "auto",
}
// Send 25 requests to trigger scaling
requests := make([]*Request, 25)
for i := range requests {
requests[i] = &Request{ID: string(rune('1' + i)), Type: "GET", LatencyMS: 0}
}
output, healthy := logic.Tick(props, requests, 1)
// Check if instances were scaled up
newInstanceCount := int(props["instanceCount"].(float64))
if newInstanceCount <= 1 {
t.Error("Expected auto-scaling to increase instance count")
}
// Should process more than 10 requests (original capacity)
if len(output) <= 10 {
t.Errorf("Expected auto-scaling to increase processing capacity, got %d", len(output))
}
if !healthy {
t.Error("Expected microservice to be healthy after scaling")
}
}
func TestMicroserviceLogic_ResourceBasedLatency(t *testing.T) {
logic := MicroserviceLogic{}
// High-resource microservice
highResourceProps := map[string]any{
"instanceCount": 1.0,
"cpu": 8.0,
"ramGb": 16.0,
"rpsCapacity": 100.0,
"scalingStrategy": "manual",
}
// Low-resource microservice
lowResourceProps := map[string]any{
"instanceCount": 1.0,
"cpu": 1.0,
"ramGb": 1.0,
"rpsCapacity": 100.0,
"scalingStrategy": "manual",
}
request := []*Request{{ID: "1", Type: "GET", LatencyMS: 0, Path: []string{}}}
highOutput, _ := logic.Tick(highResourceProps, request, 1)
lowOutput, _ := logic.Tick(lowResourceProps, request, 1)
highLatency := highOutput[0].LatencyMS
lowLatency := lowOutput[0].LatencyMS
if lowLatency <= highLatency {
t.Errorf("Expected low-resource microservice (%dms) to have higher latency than high-resource (%dms)",
lowLatency, highLatency)
}
}
func TestMicroserviceLogic_RequestTypeLatency(t *testing.T) {
logic := MicroserviceLogic{}
props := map[string]any{
"instanceCount": 1.0,
"cpu": 2.0,
"ramGb": 4.0,
"rpsCapacity": 100.0,
"scalingStrategy": "manual",
}
getRequest := []*Request{{ID: "1", Type: "GET", LatencyMS: 0, Path: []string{}}}
postRequest := []*Request{{ID: "2", Type: "POST", LatencyMS: 0, Path: []string{}}}
computeRequest := []*Request{{ID: "3", Type: "COMPUTE", LatencyMS: 0, Path: []string{}}}
getOutput, _ := logic.Tick(props, getRequest, 1)
postOutput, _ := logic.Tick(props, postRequest, 1)
computeOutput, _ := logic.Tick(props, computeRequest, 1)
getLatency := getOutput[0].LatencyMS
postLatency := postOutput[0].LatencyMS
computeLatency := computeOutput[0].LatencyMS
if getLatency >= postLatency {
t.Errorf("Expected GET (%dms) to be faster than POST (%dms)", getLatency, postLatency)
}
if postLatency >= computeLatency {
t.Errorf("Expected POST (%dms) to be faster than COMPUTE (%dms)", postLatency, computeLatency)
}
}
func TestMicroserviceLogic_HighLoadLatencyPenalty(t *testing.T) {
logic := MicroserviceLogic{}
props := map[string]any{
"instanceCount": 1.0,
"cpu": 2.0,
"ramGb": 4.0,
"rpsCapacity": 10.0,
"scalingStrategy": "manual",
}
// Low load scenario
lowLoadRequest := []*Request{{ID: "1", Type: "GET", LatencyMS: 0, Path: []string{}}}
lowOutput, _ := logic.Tick(props, lowLoadRequest, 1)
lowLatency := lowOutput[0].LatencyMS
// High load scenario (above 80% capacity threshold)
highLoadRequests := make([]*Request, 9) // 90% of 10 RPS capacity
for i := range highLoadRequests {
highLoadRequests[i] = &Request{ID: string(rune('1' + i)), Type: "GET", LatencyMS: 0, Path: []string{}}
}
highOutput, _ := logic.Tick(props, highLoadRequests, 1)
// Check if first request has higher latency due to load
highLatency := highOutput[0].LatencyMS
if highLatency <= lowLatency {
t.Errorf("Expected high load scenario (%dms) to have higher latency than low load (%dms)",
highLatency, lowLatency)
}
}
func TestMicroserviceLogic_DefaultValues(t *testing.T) {
logic := MicroserviceLogic{}
// Empty props should use defaults
props := map[string]any{}
requests := []*Request{{ID: "1", Type: "GET", LatencyMS: 0, Path: []string{}}}
output, healthy := logic.Tick(props, requests, 1)
if !healthy {
t.Error("Expected microservice to be healthy with default values")
}
if len(output) != 1 {
t.Errorf("Expected 1 processed request with defaults, got %d", len(output))
}
// Should have reasonable default latency
if output[0].LatencyMS <= 0 || output[0].LatencyMS > 100 {
t.Errorf("Expected reasonable default latency, got %dms", output[0].LatencyMS)
}
}
func TestMicroserviceLogic_UnhealthyWhenOverloaded(t *testing.T) {
logic := MicroserviceLogic{}
props := map[string]any{
"instanceCount": 1.0,
"rpsCapacity": 5.0,
"scalingStrategy": "manual", // No auto-scaling
}
// Send way more requests than capacity (5 * 2 = 10 max before unhealthy)
requests := make([]*Request, 15) // 3x capacity
for i := range requests {
requests[i] = &Request{ID: string(rune('1' + i)), Type: "GET", LatencyMS: 0}
}
output, healthy := logic.Tick(props, requests, 1)
if healthy {
t.Error("Expected microservice to be unhealthy when severely overloaded")
}
// Should still process up to capacity
if len(output) != 5 {
t.Errorf("Expected 5 processed requests despite being overloaded, got %d", len(output))
}
}
func TestMicroserviceLogic_RoundRobinDistribution(t *testing.T) {
logic := MicroserviceLogic{}
props := map[string]any{
"instanceCount": 3.0,
"rpsCapacity": 10.0,
"scalingStrategy": "manual",
}
// Send 6 requests to be distributed across 3 instances
requests := make([]*Request, 6)
for i := range requests {
requests[i] = &Request{ID: string(rune('1' + i)), Type: "GET", LatencyMS: 0, Path: []string{}}
}
output, healthy := logic.Tick(props, requests, 1)
if !healthy {
t.Error("Expected microservice to be healthy")
}
if len(output) != 6 {
t.Errorf("Expected 6 processed requests, got %d", len(output))
}
// All requests should be processed (within total capacity of 30)
for _, req := range output {
if req.LatencyMS <= 0 {
t.Error("Expected all requests to have added latency")
}
}
}

221
internal/simulation/monitoring.go

@ -0,0 +1,221 @@ @@ -0,0 +1,221 @@
package simulation
type MonitoringLogic struct{}
type MetricData struct {
Timestamp int
LatencySum int
RequestCount int
ErrorCount int
QueueSize int
}
type AlertEvent struct {
Timestamp int
MetricType string
Value float64
Threshold float64
Unit string
Severity string
}
func (m MonitoringLogic) Tick(props map[string]any, queue []*Request, tick int) ([]*Request, bool) {
// Extract monitoring properties
tool := AsString(props["tool"])
if tool == "" {
tool = "Prometheus" // default monitoring tool
}
alertMetric := AsString(props["alertMetric"])
if alertMetric == "" {
alertMetric = "latency" // default to latency monitoring
}
thresholdValue := int(AsFloat64(props["thresholdValue"]))
if thresholdValue == 0 {
thresholdValue = 100 // default threshold
}
thresholdUnit := AsString(props["thresholdUnit"])
if thresholdUnit == "" {
thresholdUnit = "ms" // default unit
}
// Get historical metrics from props
metrics, ok := props["_metrics"].([]MetricData)
if !ok {
metrics = []MetricData{}
}
// Get alert history
alerts, ok := props["_alerts"].([]AlertEvent)
if !ok {
alerts = []AlertEvent{}
}
currentTime := tick * 100 // Convert tick to milliseconds
// Process all incoming requests (monitoring is pass-through)
output := []*Request{}
totalLatency := 0
errorCount := 0
for _, req := range queue {
// Create a copy of the request to forward
reqCopy := *req
// Add minimal monitoring overhead (1-2ms for metric collection)
monitoringOverhead := 1
if tool == "Datadog" || tool == "New Relic" {
monitoringOverhead = 2 // More feature-rich tools have slightly higher overhead
}
reqCopy.LatencyMS += monitoringOverhead
reqCopy.Path = append(reqCopy.Path, "monitored")
// Collect metrics from the request
totalLatency += req.LatencyMS
// Simple heuristic: requests with high latency are considered errors
if req.LatencyMS > 1000 { // 1 second threshold for errors
errorCount++
}
output = append(output, &reqCopy)
}
// Calculate current metrics
avgLatency := 0.0
if len(queue) > 0 {
avgLatency = float64(totalLatency) / float64(len(queue))
}
// Store current metrics
currentMetric := MetricData{
Timestamp: currentTime,
LatencySum: totalLatency,
RequestCount: len(queue),
ErrorCount: errorCount,
QueueSize: len(queue),
}
// Add to metrics history (keep last 10 data points)
metrics = append(metrics, currentMetric)
if len(metrics) > 10 {
metrics = metrics[1:]
}
// Check alert conditions
shouldAlert := false
alertValue := 0.0
switch alertMetric {
case "latency":
alertValue = avgLatency
if avgLatency > float64(thresholdValue) && len(queue) > 0 {
shouldAlert = true
}
case "throughput":
alertValue = float64(len(queue))
if len(queue) < thresholdValue { // Low throughput alert
shouldAlert = true
}
case "error_rate":
errorRate := 0.0
if len(queue) > 0 {
errorRate = float64(errorCount) / float64(len(queue)) * 100
}
alertValue = errorRate
if errorRate > float64(thresholdValue) {
shouldAlert = true
}
case "queue_size":
alertValue = float64(len(queue))
if len(queue) > thresholdValue {
shouldAlert = true
}
}
// Generate alert if threshold exceeded
if shouldAlert {
severity := "warning"
if alertValue > float64(thresholdValue)*1.5 { // 150% of threshold
severity = "critical"
}
alert := AlertEvent{
Timestamp: currentTime,
MetricType: alertMetric,
Value: alertValue,
Threshold: float64(thresholdValue),
Unit: thresholdUnit,
Severity: severity,
}
// Only add alert if it's not a duplicate of the last alert
if len(alerts) == 0 || !m.isDuplicateAlert(alerts[len(alerts)-1], alert) {
alerts = append(alerts, alert)
}
// Keep only last 20 alerts
if len(alerts) > 20 {
alerts = alerts[1:]
}
}
// Update props with collected data
props["_metrics"] = metrics
props["_alerts"] = alerts
props["_currentLatency"] = avgLatency
props["_alertCount"] = len(alerts)
// Monitoring system health - it's healthy unless it's completely overloaded
healthy := len(queue) < 10000 // Can handle very high loads
// If too many critical alerts recently, mark as unhealthy
recentCriticalAlerts := 0
for _, alert := range alerts {
if currentTime-alert.Timestamp < 10000 && alert.Severity == "critical" { // Last 10 seconds
recentCriticalAlerts++
}
}
if recentCriticalAlerts > 5 {
healthy = false
}
return output, healthy
}
// isDuplicateAlert checks if an alert is similar to the previous one to avoid spam
func (m MonitoringLogic) isDuplicateAlert(prev, current AlertEvent) bool {
return prev.MetricType == current.MetricType &&
prev.Severity == current.Severity &&
(current.Timestamp-prev.Timestamp) < 5000 // Within 5 seconds
}
// Helper function to calculate moving average
func (m MonitoringLogic) calculateMovingAverage(metrics []MetricData, window int) float64 {
if len(metrics) == 0 {
return 0
}
start := 0
if len(metrics) > window {
start = len(metrics) - window
}
sum := 0.0
count := 0
for i := start; i < len(metrics); i++ {
if metrics[i].RequestCount > 0 {
sum += float64(metrics[i].LatencySum) / float64(metrics[i].RequestCount)
count++
}
}
if count == 0 {
return 0
}
return sum / float64(count)
}

411
internal/simulation/monitoring_test.go

@ -0,0 +1,411 @@ @@ -0,0 +1,411 @@
package simulation
import (
"testing"
)
func TestMonitoringLogic_BasicPassthrough(t *testing.T) {
logic := MonitoringLogic{}
props := map[string]any{
"tool": "Prometheus",
"alertMetric": "latency",
"thresholdValue": 100.0,
"thresholdUnit": "ms",
}
requests := []*Request{
{ID: "1", Type: "GET", LatencyMS: 50, Path: []string{}},
{ID: "2", Type: "POST", LatencyMS: 75, Path: []string{}},
}
output, healthy := logic.Tick(props, requests, 1)
if !healthy {
t.Error("Expected monitoring to be healthy")
}
if len(output) != 2 {
t.Errorf("Expected 2 requests to pass through monitoring, got %d", len(output))
}
// Verify minimal latency overhead was added
for i, req := range output {
originalLatency := requests[i].LatencyMS
if req.LatencyMS <= originalLatency {
t.Errorf("Expected monitoring overhead to be added to latency")
}
if req.LatencyMS > originalLatency+5 {
t.Errorf("Expected minimal monitoring overhead, got %d ms added", req.LatencyMS-originalLatency)
}
if len(req.Path) == 0 || req.Path[len(req.Path)-1] != "monitored" {
t.Error("Expected path to be updated with 'monitored'")
}
}
}
func TestMonitoringLogic_MetricsCollection(t *testing.T) {
logic := MonitoringLogic{}
props := map[string]any{
"tool": "Datadog",
"alertMetric": "latency",
"thresholdValue": 100.0,
"thresholdUnit": "ms",
}
requests := []*Request{
{ID: "1", Type: "GET", LatencyMS: 50},
{ID: "2", Type: "POST", LatencyMS: 150},
{ID: "3", Type: "GET", LatencyMS: 75},
}
_, healthy := logic.Tick(props, requests, 1)
if !healthy {
t.Error("Expected monitoring to be healthy")
}
// Check that metrics were collected
metrics, ok := props["_metrics"].([]MetricData)
if !ok {
t.Error("Expected metrics to be collected in props")
}
if len(metrics) != 1 {
t.Errorf("Expected 1 metric data point, got %d", len(metrics))
}
metric := metrics[0]
if metric.RequestCount != 3 {
t.Errorf("Expected 3 requests counted, got %d", metric.RequestCount)
}
if metric.LatencySum != 275 { // 50 + 150 + 75
t.Errorf("Expected latency sum of 275, got %d", metric.LatencySum)
}
// Check current latency calculation
currentLatency, ok := props["_currentLatency"].(float64)
if !ok {
t.Error("Expected current latency to be calculated")
}
if currentLatency < 90 || currentLatency > 95 {
t.Errorf("Expected average latency around 91.67, got %f", currentLatency)
}
}
func TestMonitoringLogic_LatencyAlert(t *testing.T) {
logic := MonitoringLogic{}
props := map[string]any{
"tool": "Prometheus",
"alertMetric": "latency",
"thresholdValue": 80.0,
"thresholdUnit": "ms",
}
// Send requests that exceed latency threshold
requests := []*Request{
{ID: "1", Type: "GET", LatencyMS: 100},
{ID: "2", Type: "POST", LatencyMS: 120},
}
_, healthy := logic.Tick(props, requests, 1)
if !healthy {
t.Error("Expected monitoring to be healthy despite alerts")
}
// Check that alert was generated
alerts, ok := props["_alerts"].([]AlertEvent)
if !ok {
t.Error("Expected alerts to be stored in props")
}
if len(alerts) != 1 {
t.Errorf("Expected 1 alert to be generated, got %d", len(alerts))
}
alert := alerts[0]
if alert.MetricType != "latency" {
t.Errorf("Expected latency alert, got %s", alert.MetricType)
}
if alert.Threshold != 80.0 {
t.Errorf("Expected threshold of 80, got %f", alert.Threshold)
}
if alert.Value < 80.0 {
t.Errorf("Expected alert value to exceed threshold, got %f", alert.Value)
}
if alert.Severity != "warning" {
t.Errorf("Expected warning severity, got %s", alert.Severity)
}
}
func TestMonitoringLogic_ErrorRateAlert(t *testing.T) {
logic := MonitoringLogic{}
props := map[string]any{
"tool": "Prometheus",
"alertMetric": "error_rate",
"thresholdValue": 20.0, // 20% error rate threshold
"thresholdUnit": "percent",
}
// Send mix of normal and high-latency (error) requests
requests := []*Request{
{ID: "1", Type: "GET", LatencyMS: 100}, // normal
{ID: "2", Type: "POST", LatencyMS: 1200}, // error (>1000ms)
{ID: "3", Type: "GET", LatencyMS: 200}, // normal
{ID: "4", Type: "POST", LatencyMS: 1500}, // error
}
_, healthy := logic.Tick(props, requests, 1)
if !healthy {
t.Error("Expected monitoring to be healthy")
}
// Check that error rate alert was generated (50% error rate > 20% threshold)
alerts, ok := props["_alerts"].([]AlertEvent)
if !ok {
t.Error("Expected alerts to be stored in props")
}
if len(alerts) != 1 {
t.Errorf("Expected 1 alert to be generated, got %d", len(alerts))
}
alert := alerts[0]
if alert.MetricType != "error_rate" {
t.Errorf("Expected error_rate alert, got %s", alert.MetricType)
}
if alert.Value != 50.0 { // 2 errors out of 4 requests = 50%
t.Errorf("Expected 50%% error rate, got %f", alert.Value)
}
}
func TestMonitoringLogic_QueueSizeAlert(t *testing.T) {
logic := MonitoringLogic{}
props := map[string]any{
"tool": "Prometheus",
"alertMetric": "queue_size",
"thresholdValue": 5.0,
"thresholdUnit": "requests",
}
// Send more requests than threshold
requests := make([]*Request, 8)
for i := range requests {
requests[i] = &Request{ID: string(rune('1' + i)), Type: "GET", LatencyMS: 50}
}
_, healthy := logic.Tick(props, requests, 1)
if !healthy {
t.Error("Expected monitoring to be healthy with queue size alert")
}
// Check that queue size alert was generated
alerts, ok := props["_alerts"].([]AlertEvent)
if !ok {
t.Error("Expected alerts to be stored in props")
}
if len(alerts) != 1 {
t.Errorf("Expected 1 alert to be generated, got %d", len(alerts))
}
alert := alerts[0]
if alert.MetricType != "queue_size" {
t.Errorf("Expected queue_size alert, got %s", alert.MetricType)
}
if alert.Value != 8.0 {
t.Errorf("Expected queue size of 8, got %f", alert.Value)
}
}
func TestMonitoringLogic_CriticalAlert(t *testing.T) {
logic := MonitoringLogic{}
props := map[string]any{
"tool": "Prometheus",
"alertMetric": "latency",
"thresholdValue": 100.0,
"thresholdUnit": "ms",
}
// Send requests with very high latency (150% of threshold)
requests := []*Request{
{ID: "1", Type: "GET", LatencyMS: 180}, // 180 > 150 (1.5 * 100)
{ID: "2", Type: "POST", LatencyMS: 200},
}
_, healthy := logic.Tick(props, requests, 1)
if !healthy {
t.Error("Expected monitoring to be healthy")
}
alerts, ok := props["_alerts"].([]AlertEvent)
if !ok {
t.Error("Expected alerts to be stored in props")
}
if len(alerts) != 1 {
t.Errorf("Expected 1 alert to be generated, got %d", len(alerts))
}
alert := alerts[0]
if alert.Severity != "critical" {
t.Errorf("Expected critical severity for high threshold breach, got %s", alert.Severity)
}
}
func TestMonitoringLogic_DuplicateAlertSuppression(t *testing.T) {
logic := MonitoringLogic{}
props := map[string]any{
"tool": "Prometheus",
"alertMetric": "latency",
"thresholdValue": 80.0,
"thresholdUnit": "ms",
}
requests := []*Request{
{ID: "1", Type: "GET", LatencyMS: 100},
}
// First tick - should generate alert
logic.Tick(props, requests, 1)
alerts, _ := props["_alerts"].([]AlertEvent)
if len(alerts) != 1 {
t.Errorf("Expected 1 alert after first tick, got %d", len(alerts))
}
// Second tick immediately after - should suppress duplicate
logic.Tick(props, requests, 2)
alerts, _ = props["_alerts"].([]AlertEvent)
if len(alerts) != 1 {
t.Errorf("Expected duplicate alert to be suppressed, got %d alerts", len(alerts))
}
}
func TestMonitoringLogic_DefaultValues(t *testing.T) {
logic := MonitoringLogic{}
// Empty props should use defaults
props := map[string]any{}
requests := []*Request{{ID: "1", Type: "GET", LatencyMS: 50, Path: []string{}}}
output, healthy := logic.Tick(props, requests, 1)
if !healthy {
t.Error("Expected monitoring to be healthy with default values")
}
if len(output) != 1 {
t.Errorf("Expected 1 request to pass through, got %d", len(output))
}
// Should have reasonable default monitoring overhead
if output[0].LatencyMS <= 50 || output[0].LatencyMS > 55 {
t.Errorf("Expected default monitoring overhead, got %dms total", output[0].LatencyMS)
}
}
func TestMonitoringLogic_ToolSpecificOverhead(t *testing.T) {
logic := MonitoringLogic{}
// Test Prometheus (lower overhead)
propsPrometheus := map[string]any{
"tool": "Prometheus",
}
// Test Datadog (higher overhead)
propsDatadog := map[string]any{
"tool": "Datadog",
}
request := []*Request{{ID: "1", Type: "GET", LatencyMS: 50, Path: []string{}}}
prometheusOutput, _ := logic.Tick(propsPrometheus, request, 1)
datadogOutput, _ := logic.Tick(propsDatadog, request, 1)
prometheusOverhead := prometheusOutput[0].LatencyMS - 50
datadogOverhead := datadogOutput[0].LatencyMS - 50
if datadogOverhead <= prometheusOverhead {
t.Errorf("Expected Datadog (%dms) to have higher overhead than Prometheus (%dms)",
datadogOverhead, prometheusOverhead)
}
}
func TestMonitoringLogic_UnhealthyWithManyAlerts(t *testing.T) {
logic := MonitoringLogic{}
props := map[string]any{
"tool": "Prometheus",
"alertMetric": "latency",
"thresholdValue": 50.0,
"thresholdUnit": "ms",
}
// Manually create many recent critical alerts to simulate an unhealthy state
currentTime := 10000 // 10 seconds
recentAlerts := []AlertEvent{
{Timestamp: currentTime - 1000, MetricType: "latency", Severity: "critical", Value: 200},
{Timestamp: currentTime - 2000, MetricType: "latency", Severity: "critical", Value: 180},
{Timestamp: currentTime - 3000, MetricType: "latency", Severity: "critical", Value: 190},
{Timestamp: currentTime - 4000, MetricType: "latency", Severity: "critical", Value: 170},
{Timestamp: currentTime - 5000, MetricType: "latency", Severity: "critical", Value: 160},
{Timestamp: currentTime - 6000, MetricType: "latency", Severity: "critical", Value: 150},
}
// Set up the props with existing critical alerts
props["_alerts"] = recentAlerts
// Make a request that would trigger another alert (low latency to avoid triggering new alert)
requests := []*Request{{ID: "1", Type: "GET", LatencyMS: 40}}
// This tick should recognize the existing critical alerts and mark system as unhealthy
_, healthy := logic.Tick(props, requests, 100) // tick 100 = 10000ms
if healthy {
t.Error("Expected monitoring to be unhealthy due to many recent critical alerts")
}
}
func TestMonitoringLogic_MetricsHistoryLimit(t *testing.T) {
logic := MonitoringLogic{}
props := map[string]any{
"tool": "Prometheus",
}
request := []*Request{{ID: "1", Type: "GET", LatencyMS: 50}}
// Generate more than 10 metric data points
for i := 0; i < 15; i++ {
logic.Tick(props, request, i)
}
metrics, ok := props["_metrics"].([]MetricData)
if !ok {
t.Error("Expected metrics to be stored")
}
if len(metrics) != 10 {
t.Errorf("Expected metrics history to be limited to 10, got %d", len(metrics))
}
}

55
internal/simulation/testdata/cache_design.json vendored

@ -0,0 +1,55 @@ @@ -0,0 +1,55 @@
{
"nodes": [
{
"id": "webserver",
"type": "webserver",
"position": { "x": 0, "y": 0 },
"props": {
"label": "Web Server",
"rpsCapacity": 100
}
},
{
"id": "cache",
"type": "cache",
"position": { "x": 100, "y": 0 },
"props": {
"label": "Redis Cache",
"cacheTTL": 300000,
"maxEntries": 1000,
"evictionPolicy": "LRU"
}
},
{
"id": "database",
"type": "database",
"position": { "x": 200, "y": 0 },
"props": {
"label": "Primary DB",
"replication": 2,
"maxRPS": 500,
"baseLatencyMs": 20
}
}
],
"connections": [
{
"source": "webserver",
"target": "cache",
"label": "Cache Lookup",
"direction": "forward",
"protocol": "Redis",
"tls": false,
"capacity": 1000
},
{
"source": "cache",
"target": "database",
"label": "Cache Miss",
"direction": "forward",
"protocol": "TCP",
"tls": true,
"capacity": 1000
}
]
}

35
internal/simulation/testdata/database_design.json vendored

@ -0,0 +1,35 @@ @@ -0,0 +1,35 @@
{
"nodes": [
{
"id": "webserver",
"type": "webserver",
"position": { "x": 0, "y": 0 },
"props": {
"label": "Web Server",
"rpsCapacity": 100
}
},
{
"id": "database",
"type": "database",
"position": { "x": 100, "y": 0 },
"props": {
"label": "Primary DB",
"replication": 2,
"maxRPS": 500,
"baseLatencyMs": 15
}
}
],
"connections": [
{
"source": "webserver",
"target": "database",
"label": "DB Queries",
"direction": "forward",
"protocol": "TCP",
"tls": true,
"capacity": 1000
}
]
}

188
internal/simulation/testdata/datapipeline_design.json vendored

@ -0,0 +1,188 @@ @@ -0,0 +1,188 @@
{
"nodes": [
{
"id": "data-source",
"type": "webserver",
"position": { "x": 100, "y": 200 },
"props": {
"label": "Data Ingestion API",
"rpsCapacity": 500
}
},
{
"id": "raw-data-queue",
"type": "messageQueue",
"position": { "x": 300, "y": 200 },
"props": {
"label": "Raw Data Queue",
"queueCapacity": 10000,
"retentionSeconds": 3600,
"processingRate": 200
}
},
{
"id": "etl-pipeline-1",
"type": "data pipeline",
"position": { "x": 500, "y": 150 },
"props": {
"label": "Data Cleansing Pipeline",
"batchSize": 100,
"transformation": "validate"
}
},
{
"id": "etl-pipeline-2",
"type": "data pipeline",
"position": { "x": 500, "y": 250 },
"props": {
"label": "Data Transformation Pipeline",
"batchSize": 50,
"transformation": "aggregate"
}
},
{
"id": "ml-pipeline",
"type": "data pipeline",
"position": { "x": 700, "y": 150 },
"props": {
"label": "ML Feature Pipeline",
"batchSize": 200,
"transformation": "enrich"
}
},
{
"id": "analytics-pipeline",
"type": "data pipeline",
"position": { "x": 700, "y": 250 },
"props": {
"label": "Analytics Pipeline",
"batchSize": 500,
"transformation": "join"
}
},
{
"id": "cache-1",
"type": "cache",
"position": { "x": 900, "y": 150 },
"props": {
"label": "Feature Cache",
"cacheTTL": 300,
"maxEntries": 50000,
"evictionPolicy": "LRU"
}
},
{
"id": "data-warehouse",
"type": "database",
"position": { "x": 900, "y": 250 },
"props": {
"label": "Data Warehouse",
"replication": 3,
"maxRPS": 1000,
"baseLatencyMs": 50
}
},
{
"id": "monitoring-1",
"type": "monitoring/alerting",
"position": { "x": 500, "y": 350 },
"props": {
"label": "Pipeline Monitor",
"tool": "Datadog",
"alertMetric": "latency",
"thresholdValue": 1000,
"thresholdUnit": "ms"
}
},
{
"id": "compression-pipeline",
"type": "data pipeline",
"position": { "x": 300, "y": 350 },
"props": {
"label": "Data Compression",
"batchSize": 1000,
"transformation": "compress"
}
}
],
"connections": [
{
"source": "data-source",
"target": "raw-data-queue",
"label": "Raw Data Stream",
"protocol": "http"
},
{
"source": "raw-data-queue",
"target": "etl-pipeline-1",
"label": "Data Validation",
"protocol": "tcp"
},
{
"source": "raw-data-queue",
"target": "etl-pipeline-2",
"label": "Data Transformation",
"protocol": "tcp"
},
{
"source": "etl-pipeline-1",
"target": "ml-pipeline",
"label": "Clean Data",
"protocol": "tcp"
},
{
"source": "etl-pipeline-2",
"target": "analytics-pipeline",
"label": "Transformed Data",
"protocol": "tcp"
},
{
"source": "ml-pipeline",
"target": "cache-1",
"label": "ML Features",
"protocol": "tcp"
},
{
"source": "analytics-pipeline",
"target": "data-warehouse",
"label": "Analytics Data",
"protocol": "tcp"
},
{
"source": "etl-pipeline-1",
"target": "monitoring-1",
"label": "Pipeline Metrics",
"protocol": "http"
},
{
"source": "etl-pipeline-2",
"target": "monitoring-1",
"label": "Pipeline Metrics",
"protocol": "http"
},
{
"source": "ml-pipeline",
"target": "monitoring-1",
"label": "Pipeline Metrics",
"protocol": "http"
},
{
"source": "analytics-pipeline",
"target": "monitoring-1",
"label": "Pipeline Metrics",
"protocol": "http"
},
{
"source": "raw-data-queue",
"target": "compression-pipeline",
"label": "Archive Stream",
"protocol": "tcp"
},
{
"source": "compression-pipeline",
"target": "data-warehouse",
"label": "Compressed Archive",
"protocol": "tcp"
}
]
}

53
internal/simulation/testdata/messagequeue_design.json vendored

@ -0,0 +1,53 @@ @@ -0,0 +1,53 @@
{
"nodes": [
{
"id": "producer",
"type": "webserver",
"position": { "x": 0, "y": 0 },
"props": {
"label": "Message Producer",
"rpsCapacity": 50
}
},
{
"id": "messagequeue",
"type": "messageQueue",
"position": { "x": 100, "y": 0 },
"props": {
"label": "Event Queue",
"queueCapacity": 1000,
"retentionSeconds": 3600,
"processingRate": 100
}
},
{
"id": "consumer",
"type": "webserver",
"position": { "x": 200, "y": 0 },
"props": {
"label": "Message Consumer",
"rpsCapacity": 80
}
}
],
"connections": [
{
"source": "producer",
"target": "messagequeue",
"label": "Publish Messages",
"direction": "forward",
"protocol": "AMQP",
"tls": false,
"capacity": 1000
},
{
"source": "messagequeue",
"target": "consumer",
"label": "Consume Messages",
"direction": "forward",
"protocol": "AMQP",
"tls": false,
"capacity": 1000
}
]
}

96
internal/simulation/testdata/microservice_design.json vendored

@ -0,0 +1,96 @@ @@ -0,0 +1,96 @@
{
"nodes": [
{
"id": "webserver-1",
"type": "webserver",
"position": { "x": 100, "y": 200 },
"props": {
"label": "API Gateway",
"rpsCapacity": 200
}
},
{
"id": "lb-1",
"type": "loadbalancer",
"position": { "x": 300, "y": 200 },
"props": {
"label": "API Gateway",
"algorithm": "round-robin"
}
},
{
"id": "microservice-1",
"type": "microservice",
"position": { "x": 500, "y": 150 },
"props": {
"label": "User Service",
"instanceCount": 3,
"cpu": 4,
"ramGb": 8,
"rpsCapacity": 100,
"monthlyUsd": 150,
"scalingStrategy": "auto",
"apiVersion": "v2"
}
},
{
"id": "microservice-2",
"type": "microservice",
"position": { "x": 500, "y": 250 },
"props": {
"label": "Order Service",
"instanceCount": 2,
"cpu": 2,
"ramGb": 4,
"rpsCapacity": 80,
"monthlyUsd": 90,
"scalingStrategy": "manual",
"apiVersion": "v1"
}
},
{
"id": "db-1",
"type": "database",
"position": { "x": 700, "y": 200 },
"props": {
"label": "PostgreSQL",
"replication": 2,
"maxRPS": 500,
"baseLatencyMs": 15
}
}
],
"connections": [
{
"source": "webserver-1",
"target": "lb-1",
"label": "HTTPS Requests",
"protocol": "https",
"tls": true
},
{
"source": "lb-1",
"target": "microservice-1",
"label": "User API",
"protocol": "http"
},
{
"source": "lb-1",
"target": "microservice-2",
"label": "Order API",
"protocol": "http"
},
{
"source": "microservice-1",
"target": "db-1",
"label": "User Queries",
"protocol": "tcp"
},
{
"source": "microservice-2",
"target": "db-1",
"label": "Order Queries",
"protocol": "tcp"
}
]
}

127
internal/simulation/testdata/monitoring_design.json vendored

@ -0,0 +1,127 @@ @@ -0,0 +1,127 @@
{
"nodes": [
{
"id": "webserver-1",
"type": "webserver",
"position": { "x": 100, "y": 200 },
"props": {
"label": "Web Server",
"rpsCapacity": 100
}
},
{
"id": "monitor-1",
"type": "monitoring/alerting",
"position": { "x": 300, "y": 200 },
"props": {
"label": "Prometheus Monitor",
"tool": "Prometheus",
"alertMetric": "latency",
"thresholdValue": 80,
"thresholdUnit": "ms"
}
},
{
"id": "lb-1",
"type": "loadbalancer",
"position": { "x": 500, "y": 200 },
"props": {
"label": "Load Balancer",
"algorithm": "round-robin"
}
},
{
"id": "microservice-1",
"type": "microservice",
"position": { "x": 700, "y": 150 },
"props": {
"label": "User Service",
"instanceCount": 2,
"cpu": 2,
"ramGb": 4,
"rpsCapacity": 50,
"scalingStrategy": "auto"
}
},
{
"id": "microservice-2",
"type": "microservice",
"position": { "x": 700, "y": 250 },
"props": {
"label": "Order Service",
"instanceCount": 1,
"cpu": 1,
"ramGb": 2,
"rpsCapacity": 30,
"scalingStrategy": "manual"
}
},
{
"id": "monitor-2",
"type": "monitoring/alerting",
"position": { "x": 900, "y": 200 },
"props": {
"label": "Error Rate Monitor",
"tool": "Datadog",
"alertMetric": "error_rate",
"thresholdValue": 5,
"thresholdUnit": "percent"
}
},
{
"id": "db-1",
"type": "database",
"position": { "x": 1100, "y": 200 },
"props": {
"label": "PostgreSQL",
"replication": 2,
"maxRPS": 200,
"baseLatencyMs": 15
}
}
],
"connections": [
{
"source": "webserver-1",
"target": "monitor-1",
"label": "HTTP Requests",
"protocol": "http"
},
{
"source": "monitor-1",
"target": "lb-1",
"label": "Monitored Requests",
"protocol": "http"
},
{
"source": "lb-1",
"target": "microservice-1",
"label": "User API",
"protocol": "http"
},
{
"source": "lb-1",
"target": "microservice-2",
"label": "Order API",
"protocol": "http"
},
{
"source": "microservice-1",
"target": "monitor-2",
"label": "Service Metrics",
"protocol": "http"
},
{
"source": "microservice-2",
"target": "monitor-2",
"label": "Service Metrics",
"protocol": "http"
},
{
"source": "monitor-2",
"target": "db-1",
"label": "Database Queries",
"protocol": "tcp"
}
]
}

2
internal/simulation/testdata/simple_design.json vendored

@ -16,7 +16,7 @@ @@ -16,7 +16,7 @@
"props": {
"label": "Web Server",
"instanceSize": "medium",
"capacityRPS": 5,
"rpsCapacity": 5,
"baseLatencyMs": 50,
"penaltyPerRPS": 10
}

164
internal/simulation/testdata/thirdpartyservice_design.json vendored

@ -0,0 +1,164 @@ @@ -0,0 +1,164 @@
{
"nodes": [
{
"id": "webserver-1",
"type": "webserver",
"position": { "x": 100, "y": 200 },
"props": {
"label": "E-commerce API",
"rpsCapacity": 200
}
},
{
"id": "microservice-1",
"type": "microservice",
"position": { "x": 300, "y": 200 },
"props": {
"label": "Payment Service",
"instanceCount": 2,
"cpu": 4,
"ramGb": 8,
"rpsCapacity": 100,
"scalingStrategy": "auto"
}
},
{
"id": "stripe-service",
"type": "third party service",
"position": { "x": 500, "y": 150 },
"props": {
"label": "Stripe Payments",
"provider": "Stripe",
"latency": 180
}
},
{
"id": "twilio-service",
"type": "third party service",
"position": { "x": 500, "y": 250 },
"props": {
"label": "SMS Notifications",
"provider": "Twilio",
"latency": 250
}
},
{
"id": "microservice-2",
"type": "microservice",
"position": { "x": 300, "y": 350 },
"props": {
"label": "Notification Service",
"instanceCount": 1,
"cpu": 2,
"ramGb": 4,
"rpsCapacity": 50,
"scalingStrategy": "manual"
}
},
{
"id": "sendgrid-service",
"type": "third party service",
"position": { "x": 500, "y": 350 },
"props": {
"label": "Email Service",
"provider": "SendGrid",
"latency": 200
}
},
{
"id": "slack-service",
"type": "third party service",
"position": { "x": 500, "y": 450 },
"props": {
"label": "Slack Alerts",
"provider": "Slack",
"latency": 300
}
},
{
"id": "monitor-1",
"type": "monitoring/alerting",
"position": { "x": 700, "y": 200 },
"props": {
"label": "System Monitor",
"tool": "Datadog",
"alertMetric": "latency",
"thresholdValue": 500,
"thresholdUnit": "ms"
}
},
{
"id": "db-1",
"type": "database",
"position": { "x": 700, "y": 350 },
"props": {
"label": "Transaction DB",
"replication": 2,
"maxRPS": 300,
"baseLatencyMs": 20
}
}
],
"connections": [
{
"source": "webserver-1",
"target": "microservice-1",
"label": "Payment Requests",
"protocol": "https"
},
{
"source": "microservice-1",
"target": "stripe-service",
"label": "Process Payment",
"protocol": "https"
},
{
"source": "microservice-1",
"target": "twilio-service",
"label": "SMS Confirmation",
"protocol": "https"
},
{
"source": "webserver-1",
"target": "microservice-2",
"label": "Notification Requests",
"protocol": "https"
},
{
"source": "microservice-2",
"target": "sendgrid-service",
"label": "Send Email",
"protocol": "https"
},
{
"source": "microservice-2",
"target": "slack-service",
"label": "Admin Alerts",
"protocol": "https"
},
{
"source": "stripe-service",
"target": "monitor-1",
"label": "Payment Metrics",
"protocol": "http"
},
{
"source": "twilio-service",
"target": "monitor-1",
"label": "SMS Metrics",
"protocol": "http"
},
{
"source": "sendgrid-service",
"target": "monitor-1",
"label": "Email Metrics",
"protocol": "http"
},
{
"source": "monitor-1",
"target": "db-1",
"label": "Store Metrics",
"protocol": "tcp"
}
]
}

219
internal/simulation/thirdpartyservice.go

@ -0,0 +1,219 @@ @@ -0,0 +1,219 @@
package simulation
import (
"math/rand"
)
type ThirdPartyServiceLogic struct{}
type ServiceStatus struct {
IsUp bool
LastCheck int
FailureCount int
SuccessCount int
RateLimitHits int
}
func (t ThirdPartyServiceLogic) Tick(props map[string]any, queue []*Request, tick int) ([]*Request, bool) {
// Extract third-party service properties
provider := AsString(props["provider"])
if provider == "" {
provider = "Generic" // default provider
}
baseLatency := int(AsFloat64(props["latency"]))
if baseLatency == 0 {
baseLatency = 200 // default 200ms latency
}
// Get service status from props (persistent state)
status, ok := props["_serviceStatus"].(ServiceStatus)
if !ok {
status = ServiceStatus{
IsUp: true,
LastCheck: tick,
FailureCount: 0,
SuccessCount: 0,
RateLimitHits: 0,
}
}
currentTime := tick * 100 // Convert tick to milliseconds
// Simulate service availability and characteristics based on provider
reliability := t.getProviderReliability(provider)
rateLimitRPS := t.getProviderRateLimit(provider)
latencyVariance := t.getProviderLatencyVariance(provider)
// Check if service is down and should recover
if !status.IsUp {
// Services typically recover after some time
if currentTime-status.LastCheck > 30000 { // 30 seconds downtime
status.IsUp = true
status.FailureCount = 0
}
}
// Apply rate limiting - third-party services often have strict limits
requestsThisTick := len(queue)
if requestsThisTick > rateLimitRPS {
status.RateLimitHits++
// Only process up to rate limit
queue = queue[:rateLimitRPS]
}
output := []*Request{}
for _, req := range queue {
reqCopy := *req
// Simulate service availability
if !status.IsUp {
// Service is down - simulate timeout/error
reqCopy.LatencyMS += 10000 // 10 second timeout
reqCopy.Path = append(reqCopy.Path, "third-party-timeout")
status.FailureCount++
} else {
// Service is up - calculate response time
serviceLatency := t.calculateServiceLatency(provider, baseLatency, latencyVariance)
// Random failure based on reliability
if rand.Float64() > reliability {
// Service call failed
serviceLatency += 5000 // 5 second timeout on failure
reqCopy.Path = append(reqCopy.Path, "third-party-failed")
status.FailureCount++
// If too many failures, mark service as down
if status.FailureCount > 5 {
status.IsUp = false
status.LastCheck = currentTime
}
} else {
// Successful service call
reqCopy.Path = append(reqCopy.Path, "third-party-success")
status.SuccessCount++
// Reset failure count on successful calls
if status.FailureCount > 0 {
status.FailureCount--
}
}
reqCopy.LatencyMS += serviceLatency
}
output = append(output, &reqCopy)
}
// Update persistent state
props["_serviceStatus"] = status
// Health check: service is healthy if external service is up and not excessively rate limited
// Allow some rate limiting but not too much
maxRateLimitHits := 10 // Allow up to 10 rate limit hits before considering unhealthy
healthy := status.IsUp && status.RateLimitHits < maxRateLimitHits
return output, healthy
}
// getProviderReliability returns the reliability percentage for different providers
func (t ThirdPartyServiceLogic) getProviderReliability(provider string) float64 {
switch provider {
case "Stripe":
return 0.999 // 99.9% uptime
case "Twilio":
return 0.998 // 99.8% uptime
case "SendGrid":
return 0.997 // 99.7% uptime
case "AWS":
return 0.9995 // 99.95% uptime
case "Google":
return 0.9999 // 99.99% uptime
case "Slack":
return 0.995 // 99.5% uptime
case "GitHub":
return 0.996 // 99.6% uptime
case "Shopify":
return 0.998 // 99.8% uptime
default:
return 0.99 // 99% uptime for generic services
}
}
// getProviderRateLimit returns the rate limit (requests per tick) for different providers
func (t ThirdPartyServiceLogic) getProviderRateLimit(provider string) int {
switch provider {
case "Stripe":
return 100 // 100 requests per second (per tick in our sim)
case "Twilio":
return 50 // More restrictive
case "SendGrid":
return 200 // Email is typically higher volume
case "AWS":
return 1000 // Very high limits
case "Google":
return 500 // High but controlled
case "Slack":
return 30 // Very restrictive for chat APIs
case "GitHub":
return 60 // GitHub API limits
case "Shopify":
return 80 // E-commerce API limits
default:
return 100 // Default rate limit
}
}
// getProviderLatencyVariance returns the latency variance factor for different providers
func (t ThirdPartyServiceLogic) getProviderLatencyVariance(provider string) float64 {
switch provider {
case "Stripe":
return 0.3 // Low variance, consistent performance
case "Twilio":
return 0.5 // Moderate variance
case "SendGrid":
return 0.4 // Email services are fairly consistent
case "AWS":
return 0.2 // Very consistent
case "Google":
return 0.25 // Very consistent
case "Slack":
return 0.6 // Chat services can be variable
case "GitHub":
return 0.4 // Moderate variance
case "Shopify":
return 0.5 // E-commerce can be variable under load
default:
return 0.5 // Default variance
}
}
// calculateServiceLatency computes the actual latency including variance
func (t ThirdPartyServiceLogic) calculateServiceLatency(provider string, baseLatency int, variance float64) int {
// Add random variance to base latency
varianceMs := float64(baseLatency) * variance
randomVariance := (rand.Float64() - 0.5) * 2 * varianceMs // -variance to +variance
finalLatency := float64(baseLatency) + randomVariance
// Ensure minimum latency (can't be negative or too low)
if finalLatency < 10 {
finalLatency = 10
}
// Add provider-specific baseline adjustments
switch provider {
case "AWS", "Google":
// Cloud providers are typically fast
finalLatency *= 0.8
case "Slack":
// Chat APIs can be slower
finalLatency *= 1.2
case "Twilio":
// Telecom APIs have processing overhead
finalLatency *= 1.1
}
return int(finalLatency)
}

382
internal/simulation/thirdpartyservice_test.go

@ -0,0 +1,382 @@ @@ -0,0 +1,382 @@
package simulation
import (
"testing"
)
func TestThirdPartyServiceLogic_BasicProcessing(t *testing.T) {
logic := ThirdPartyServiceLogic{}
props := map[string]any{
"provider": "Stripe",
"latency": 150.0,
}
requests := []*Request{
{ID: "1", Type: "POST", LatencyMS: 50, Path: []string{}},
{ID: "2", Type: "GET", LatencyMS: 30, Path: []string{}},
}
output, healthy := logic.Tick(props, requests, 1)
if !healthy {
t.Error("Expected third party service to be healthy")
}
if len(output) != 2 {
t.Errorf("Expected 2 processed requests, got %d", len(output))
}
// Verify latency was added (should be around base latency with some variance)
for i, req := range output {
originalLatency := requests[i].LatencyMS
if req.LatencyMS <= originalLatency {
t.Errorf("Expected third party service latency to be added")
}
// Check that path was updated
if len(req.Path) == 0 {
t.Error("Expected path to be updated")
}
lastPathElement := req.Path[len(req.Path)-1]
if lastPathElement != "third-party-success" && lastPathElement != "third-party-failed" {
t.Errorf("Expected path to indicate success or failure, got %s", lastPathElement)
}
}
}
func TestThirdPartyServiceLogic_ProviderCharacteristics(t *testing.T) {
logic := ThirdPartyServiceLogic{}
providers := []string{"Stripe", "AWS", "Slack", "Twilio"}
for _, provider := range providers {
t.Run(provider, func(t *testing.T) {
props := map[string]any{
"provider": provider,
"latency": 100.0,
}
requests := []*Request{{ID: "1", Type: "POST", LatencyMS: 0, Path: []string{}}}
output, healthy := logic.Tick(props, requests, 1)
if !healthy {
t.Errorf("Expected %s service to be healthy", provider)
}
if len(output) != 1 {
t.Errorf("Expected 1 processed request for %s", provider)
}
// Verify latency characteristics
addedLatency := output[0].LatencyMS
if addedLatency <= 0 {
t.Errorf("Expected %s to add latency", provider)
}
// AWS and Google should be faster than Slack
if provider == "AWS" && addedLatency > 200 {
t.Errorf("Expected AWS to have lower latency, got %dms", addedLatency)
}
})
}
}
func TestThirdPartyServiceLogic_RateLimiting(t *testing.T) {
logic := ThirdPartyServiceLogic{}
props := map[string]any{
"provider": "Slack", // Has low rate limit (30 RPS)
"latency": 100.0,
}
// Send more requests than rate limit
requests := make([]*Request, 50) // More than Slack's 30 RPS limit
for i := range requests {
requests[i] = &Request{ID: string(rune('1' + i)), Type: "POST", LatencyMS: 0}
}
output, healthy := logic.Tick(props, requests, 1)
// Should only process up to rate limit
if len(output) != 30 {
t.Errorf("Expected 30 processed requests due to Slack rate limit, got %d", len(output))
}
// Service should still be healthy with rate limiting
if !healthy {
t.Error("Expected service to be healthy despite rate limiting")
}
// Check that rate limit hits were recorded
status, ok := props["_serviceStatus"].(ServiceStatus)
if !ok {
t.Error("Expected service status to be recorded")
}
if status.RateLimitHits != 1 {
t.Errorf("Expected 1 rate limit hit, got %d", status.RateLimitHits)
}
}
func TestThirdPartyServiceLogic_ServiceFailure(t *testing.T) {
logic := ThirdPartyServiceLogic{}
props := map[string]any{
"provider": "Generic",
"latency": 100.0,
}
// Set up service as already having failures
status := ServiceStatus{
IsUp: false,
LastCheck: 0,
FailureCount: 6,
}
props["_serviceStatus"] = status
requests := []*Request{{ID: "1", Type: "POST", LatencyMS: 50, Path: []string{}}}
output, healthy := logic.Tick(props, requests, 1)
if healthy {
t.Error("Expected service to be unhealthy when external service is down")
}
if len(output) != 1 {
t.Error("Expected request to be processed even when service is down")
}
// Should have very high latency due to timeout
if output[0].LatencyMS < 5000 {
t.Errorf("Expected high latency for service failure, got %dms", output[0].LatencyMS)
}
// Check path indicates timeout
lastPath := output[0].Path[len(output[0].Path)-1]
if lastPath != "third-party-timeout" {
t.Errorf("Expected timeout path, got %s", lastPath)
}
}
func TestThirdPartyServiceLogic_ServiceRecovery(t *testing.T) {
logic := ThirdPartyServiceLogic{}
props := map[string]any{
"provider": "Stripe",
"latency": 100.0,
}
// Set up service as down but with old timestamp (should recover)
status := ServiceStatus{
IsUp: false,
LastCheck: 0, // Very old timestamp
FailureCount: 3,
}
props["_serviceStatus"] = status
requests := []*Request{{ID: "1", Type: "POST", LatencyMS: 50, Path: []string{}}}
// Run with current tick that's more than 30 seconds later
_, healthy := logic.Tick(props, requests, 400) // 40 seconds later
if !healthy {
t.Error("Expected service to be healthy after recovery")
}
// Check that service recovered
updatedStatus, ok := props["_serviceStatus"].(ServiceStatus)
if !ok {
t.Error("Expected updated service status")
}
if !updatedStatus.IsUp {
t.Error("Expected service to have recovered")
}
if updatedStatus.FailureCount != 0 {
t.Error("Expected failure count to be reset on recovery")
}
}
func TestThirdPartyServiceLogic_ReliabilityDifferences(t *testing.T) {
logic := ThirdPartyServiceLogic{}
// Test different reliability levels
testCases := []struct {
provider string
expectedReliability float64
}{
{"AWS", 0.9995},
{"Google", 0.9999},
{"Stripe", 0.999},
{"Slack", 0.995},
{"Generic", 0.99},
}
for _, tc := range testCases {
reliability := logic.getProviderReliability(tc.provider)
if reliability != tc.expectedReliability {
t.Errorf("Expected %s reliability %.4f, got %.4f",
tc.provider, tc.expectedReliability, reliability)
}
}
}
func TestThirdPartyServiceLogic_RateLimitDifferences(t *testing.T) {
logic := ThirdPartyServiceLogic{}
// Test different rate limits
testCases := []struct {
provider string
expectedLimit int
}{
{"AWS", 1000},
{"Stripe", 100},
{"Slack", 30},
{"SendGrid", 200},
{"Twilio", 50},
}
for _, tc := range testCases {
rateLimit := logic.getProviderRateLimit(tc.provider)
if rateLimit != tc.expectedLimit {
t.Errorf("Expected %s rate limit %d, got %d",
tc.provider, tc.expectedLimit, rateLimit)
}
}
}
func TestThirdPartyServiceLogic_LatencyVariance(t *testing.T) {
logic := ThirdPartyServiceLogic{}
props := map[string]any{
"provider": "Stripe",
"latency": 100.0,
}
requests := []*Request{{ID: "1", Type: "POST", LatencyMS: 0, Path: []string{}}}
latencies := []int{}
// Run multiple times to observe variance
for i := 0; i < 10; i++ {
output, _ := logic.Tick(props, requests, i)
latencies = append(latencies, output[0].LatencyMS)
}
// Check that we have variance (not all latencies are the same)
allSame := true
firstLatency := latencies[0]
for _, latency := range latencies[1:] {
if latency != firstLatency {
allSame = false
break
}
}
if allSame {
t.Error("Expected latency variance, but all latencies were the same")
}
// All latencies should be reasonable (between 50ms and 300ms for Stripe)
for _, latency := range latencies {
if latency < 50 || latency > 300 {
t.Errorf("Expected reasonable latency for Stripe, got %dms", latency)
}
}
}
func TestThirdPartyServiceLogic_DefaultValues(t *testing.T) {
logic := ThirdPartyServiceLogic{}
// Empty props should use defaults
props := map[string]any{}
requests := []*Request{{ID: "1", Type: "POST", LatencyMS: 0, Path: []string{}}}
output, healthy := logic.Tick(props, requests, 1)
if !healthy {
t.Error("Expected service to be healthy with default values")
}
if len(output) != 1 {
t.Error("Expected 1 processed request with defaults")
}
// Should have reasonable default latency (around 200ms base)
if output[0].LatencyMS < 100 || output[0].LatencyMS > 400 {
t.Errorf("Expected reasonable default latency, got %dms", output[0].LatencyMS)
}
}
func TestThirdPartyServiceLogic_SuccessCountTracking(t *testing.T) {
logic := ThirdPartyServiceLogic{}
props := map[string]any{
"provider": "AWS", // High reliability
"latency": 50.0,
}
requests := []*Request{{ID: "1", Type: "POST", LatencyMS: 0, Path: []string{}}}
// Run multiple successful requests
for i := 0; i < 5; i++ {
logic.Tick(props, requests, i)
}
status, ok := props["_serviceStatus"].(ServiceStatus)
if !ok {
t.Error("Expected service status to be tracked")
}
// Should have accumulated success count
if status.SuccessCount == 0 {
t.Error("Expected success count to be tracked")
}
// Should be healthy
if !status.IsUp {
t.Error("Expected service to remain up with successful calls")
}
}
func TestThirdPartyServiceLogic_FailureRecovery(t *testing.T) {
logic := ThirdPartyServiceLogic{}
props := map[string]any{
"provider": "Generic",
"latency": 100.0,
}
// Set up service with some failures but still up
status := ServiceStatus{
IsUp: true,
FailureCount: 3,
SuccessCount: 0,
}
props["_serviceStatus"] = status
requests := []*Request{{ID: "1", Type: "POST", LatencyMS: 0, Path: []string{}}}
// Simulate a successful call (with high probability for Generic service)
// We'll run this multiple times to ensure we get at least one success
successFound := false
for i := 0; i < 10 && !successFound; i++ {
output, _ := logic.Tick(props, requests, i)
if len(output[0].Path) > 0 && output[0].Path[len(output[0].Path)-1] == "third-party-success" {
successFound = true
}
}
if successFound {
updatedStatus, _ := props["_serviceStatus"].(ServiceStatus)
// Failure count should have decreased
if updatedStatus.FailureCount >= 3 {
t.Error("Expected failure count to decrease after successful call")
}
}
}

2
internal/simulation/webserver.go

@ -6,7 +6,7 @@ type WebServerLogic struct { @@ -6,7 +6,7 @@ type WebServerLogic struct {
}
func (l WebServerLogic) Tick(props map[string]any, queue []*Request, tick int) ([]*Request, bool) {
maxRPS := int(AsFloat64(props["capacityRPS"]))
maxRPS := int(AsFloat64(props["rpsCapacity"]))
toProcess := queue
if len(queue) > maxRPS {

422
router/handlers/simulation.go

@ -2,17 +2,24 @@ package handlers @@ -2,17 +2,24 @@ package handlers
import (
"encoding/json"
"fmt"
"net/http"
"systemdesigngame/internal/design"
"systemdesigngame/internal/level"
"systemdesigngame/internal/simulation"
)
type SimulationHandler struct{}
type SimulationResponse struct {
Success bool `json:"success"`
Metrics map[string]interface{} `json:"metrics,omitempty"`
Timeline []interface{} `json:"timeline,omitempty"`
Error string `json:"error,omitempty"`
Success bool `json:"success"`
Metrics map[string]interface{} `json:"metrics,omitempty"`
Timeline []interface{} `json:"timeline,omitempty"`
Passed bool `json:"passed,omitempty"`
Score int `json:"score,omitempty"`
Feedback []string `json:"feedback,omitempty"`
LevelName string `json:"levelName,omitempty"`
Error string `json:"error,omitempty"`
}
func (h *SimulationHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
@ -21,22 +28,96 @@ func (h *SimulationHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) { @@ -21,22 +28,96 @@ func (h *SimulationHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
return
}
var design design.Design
if err := json.NewDecoder(r.Body).Decode(&design); err != nil {
http.Error(w, "Invalid design JSON: "+err.Error(), http.StatusBadRequest)
var requestBody struct {
Design design.Design `json:"design"`
LevelName string `json:"levelName,omitempty"`
Difficulty string `json:"difficulty,omitempty"`
}
if err := json.NewDecoder(r.Body).Decode(&requestBody); err != nil {
// Try to decode as just design for backward compatibility
r.Body.Close()
var design design.Design
if err2 := json.NewDecoder(r.Body).Decode(&design); err2 != nil {
http.Error(w, "Invalid request JSON: "+err.Error(), http.StatusBadRequest)
return
}
requestBody.Design = design
}
// Extract the design for processing
design := requestBody.Design
// Run the actual simulation
engine := simulation.NewEngineFromDesign(design, 100)
if engine == nil {
response := SimulationResponse{
Success: false,
Error: "Failed to create simulation engine - no valid components found",
}
w.Header().Set("Content-Type", "application/json")
json.NewEncoder(w).Encode(response)
return
}
// For now, return a mock successful response but eventually, we want to go to the results page(s)
// Set simulation parameters
engine.RPS = 50 // Default RPS - could be configurable later
// Find entry node by analyzing topology
entryNode := findEntryNode(design)
if entryNode == "" {
response := SimulationResponse{
Success: false,
Error: "No entry point found - design must include a component with no incoming connections (webserver, microservice, load balancer, etc.)",
}
w.Header().Set("Content-Type", "application/json")
json.NewEncoder(w).Encode(response)
return
}
engine.EntryNode = entryNode
// Run simulation for 60 ticks (6 seconds at 100ms per tick)
snapshots := engine.Run(60, 100)
// Calculate metrics from snapshots
metrics := calculateMetrics(snapshots)
// Convert snapshots to interface{} for JSON serialization
timeline := make([]interface{}, len(snapshots))
for i, snapshot := range snapshots {
timeline[i] = snapshot
}
// Perform level validation if level info provided
var passed bool
var score int
var feedback []string
var levelName string
if requestBody.LevelName != "" {
difficulty := level.DifficultyEasy // default
if requestBody.Difficulty != "" {
difficulty = level.Difficulty(requestBody.Difficulty)
}
if lvl, err := level.GetLevel(requestBody.LevelName, difficulty); err == nil {
levelName = lvl.Name
passed, score, feedback = validateLevel(lvl, design, metrics)
} else {
feedback = []string{"Warning: Level not found, simulation ran without validation"}
}
}
response := SimulationResponse{
Success: true,
Metrics: map[string]interface{}{
"throughput": 250,
"latency_p95": 85,
"cost_monthly": 120,
"availability": 99.5,
},
Timeline: []interface{}{}, // Will contain TickSnapshots later
Success: true,
Metrics: metrics,
Timeline: timeline,
Passed: passed,
Score: score,
Feedback: feedback,
LevelName: levelName,
}
w.Header().Set("Content-Type", "application/json")
@ -45,3 +126,312 @@ func (h *SimulationHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) { @@ -45,3 +126,312 @@ func (h *SimulationHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
return
}
}
// calculateMetrics computes key performance metrics from simulation snapshots
func calculateMetrics(snapshots []*simulation.TickSnapshot) map[string]interface{} {
if len(snapshots) == 0 {
return map[string]interface{}{
"throughput": 0,
"latency_avg": 0,
"cost_monthly": 0,
"availability": 0,
}
}
totalRequests := 0
totalLatency := 0
totalHealthy := 0
totalNodes := 0
// Calculate aggregate metrics across all snapshots
for _, snapshot := range snapshots {
// Count total requests processed in this tick
for _, requests := range snapshot.Emitted {
totalRequests += len(requests)
for _, req := range requests {
totalLatency += req.LatencyMS
}
}
// Count healthy vs total nodes
for _, healthy := range snapshot.NodeHealth {
totalNodes++
if healthy {
totalHealthy++
}
}
}
// Calculate throughput (requests per second)
// snapshots represent 6 seconds of simulation (60 ticks * 100ms)
simulationSeconds := float64(len(snapshots)) * 0.1 // 100ms per tick
throughput := float64(totalRequests) / simulationSeconds
// Calculate average latency
avgLatency := 0.0
if totalRequests > 0 {
avgLatency = float64(totalLatency) / float64(totalRequests)
}
// Calculate availability percentage
availability := 0.0
if totalNodes > 0 {
availability = (float64(totalHealthy) / float64(totalNodes)) * 100
}
// Estimate monthly cost (placeholder - could be enhanced)
monthlyCost := float64(totalNodes) * 50 // $50 per node per month baseline
return map[string]interface{}{
"throughput": int(throughput),
"latency_avg": int(avgLatency),
"cost_monthly": int(monthlyCost),
"availability": availability,
}
}
// findEntryNode analyzes the design topology to find the best entry point
func findEntryNode(design design.Design) string {
// Build map of incoming connections
incomingCount := make(map[string]int)
// Initialize all nodes with 0 incoming connections
for _, node := range design.Nodes {
incomingCount[node.ID] = 0
}
// Count incoming connections for each node
for _, conn := range design.Connections {
incomingCount[conn.Target]++
}
// Find nodes with no incoming connections (potential entry points)
var entryPoints []string
for nodeID, count := range incomingCount {
if count == 0 {
entryPoints = append(entryPoints, nodeID)
}
}
// If multiple entry points exist, prefer certain types
if len(entryPoints) > 1 {
return preferredEntryPoint(design.Nodes, entryPoints)
} else if len(entryPoints) == 1 {
return entryPoints[0]
}
return "" // No entry point found
}
// preferredEntryPoint selects the best entry point from candidates based on component type
func preferredEntryPoint(nodes []design.Node, candidateIDs []string) string {
// Priority order for entry points (most logical first)
priority := []string{
"webserver",
"microservice",
"loadBalancer", // Could be edge load balancer
"cdn", // Edge CDN
"data pipeline", // Data ingestion entry
"messageQueue", // For event-driven architectures
}
// Create lookup for candidate nodes
candidates := make(map[string]design.Node)
for _, node := range nodes {
for _, id := range candidateIDs {
if node.ID == id {
candidates[id] = node
break
}
}
}
// Find highest priority candidate
for _, nodeType := range priority {
for id, node := range candidates {
if node.Type == nodeType {
return id
}
}
}
// If no preferred type, return first candidate
if len(candidateIDs) > 0 {
return candidateIDs[0]
}
return ""
}
// validateLevel checks if the design and simulation results meet level requirements
func validateLevel(lvl *level.Level, design design.Design, metrics map[string]interface{}) (bool, int, []string) {
var feedback []string
var failedRequirements []string
var passedRequirements []string
// Extract metrics
throughput := metrics["throughput"].(int)
avgLatency := metrics["latency_avg"].(int)
availability := metrics["availability"].(float64)
monthlyCost := metrics["cost_monthly"].(int)
// Check throughput requirement
if throughput >= lvl.TargetRPS {
passedRequirements = append(passedRequirements, "Throughput requirement met")
} else {
failedRequirements = append(failedRequirements,
fmt.Sprintf("Throughput: %d RPS (required: %d RPS)", throughput, lvl.TargetRPS))
}
// Check latency requirement (using avg latency as approximation for P95)
if avgLatency <= lvl.MaxP95LatencyMs {
passedRequirements = append(passedRequirements, "Latency requirement met")
} else {
failedRequirements = append(failedRequirements,
fmt.Sprintf("Latency: %dms (max allowed: %dms)", avgLatency, lvl.MaxP95LatencyMs))
}
// Check availability requirement
if availability >= lvl.RequiredAvailabilityPct {
passedRequirements = append(passedRequirements, "Availability requirement met")
} else {
failedRequirements = append(failedRequirements,
fmt.Sprintf("Availability: %.1f%% (required: %.1f%%)", availability, lvl.RequiredAvailabilityPct))
}
// Check cost requirement
if monthlyCost <= lvl.MaxMonthlyUSD {
passedRequirements = append(passedRequirements, "Cost requirement met")
} else {
failedRequirements = append(failedRequirements,
fmt.Sprintf("Cost: $%d/month (max allowed: $%d/month)", monthlyCost, lvl.MaxMonthlyUSD))
}
// Check component requirements
componentFeedback := validateComponentRequirements(lvl, design)
if len(componentFeedback.Failed) > 0 {
failedRequirements = append(failedRequirements, componentFeedback.Failed...)
}
if len(componentFeedback.Passed) > 0 {
passedRequirements = append(passedRequirements, componentFeedback.Passed...)
}
// Determine if passed
passed := len(failedRequirements) == 0
// Calculate score (0-100)
score := calculateScore(len(passedRequirements), len(failedRequirements), metrics)
// Build feedback
if passed {
feedback = append(feedback, "Level completed successfully!")
feedback = append(feedback, "")
feedback = append(feedback, passedRequirements...)
} else {
feedback = append(feedback, "Level failed - requirements not met:")
feedback = append(feedback, "")
feedback = append(feedback, failedRequirements...)
if len(passedRequirements) > 0 {
feedback = append(feedback, "")
feedback = append(feedback, "Requirements passed:")
feedback = append(feedback, passedRequirements...)
}
}
return passed, score, feedback
}
type ComponentValidationResult struct {
Passed []string
Failed []string
}
// validateComponentRequirements checks mustInclude, mustNotInclude, etc.
func validateComponentRequirements(lvl *level.Level, design design.Design) ComponentValidationResult {
result := ComponentValidationResult{}
// Build map of component types in design
componentTypes := make(map[string]int)
for _, node := range design.Nodes {
componentTypes[node.Type]++
}
// Check mustInclude requirements
for _, required := range lvl.MustInclude {
if count, exists := componentTypes[required]; exists && count > 0 {
result.Passed = append(result.Passed, fmt.Sprintf("Required component '%s' included", required))
} else {
result.Failed = append(result.Failed, fmt.Sprintf("Missing required component: '%s'", required))
}
}
// Check mustNotInclude requirements
for _, forbidden := range lvl.MustNotInclude {
if count, exists := componentTypes[forbidden]; exists && count > 0 {
result.Failed = append(result.Failed, fmt.Sprintf("Forbidden component used: '%s'", forbidden))
}
}
// Check minReplicas requirements
for component, minCount := range lvl.MinReplicas {
if count, exists := componentTypes[component]; exists && count >= minCount {
result.Passed = append(result.Passed, fmt.Sprintf("Sufficient '%s' replicas (%d)", component, count))
} else {
actualCount := 0
if exists {
actualCount = count
}
result.Failed = append(result.Failed,
fmt.Sprintf("Insufficient '%s' replicas: %d (minimum: %d)", component, actualCount, minCount))
}
}
return result
}
// calculateScore computes a score from 0-100 based on performance
func calculateScore(passedCount, failedCount int, metrics map[string]interface{}) int {
if failedCount > 0 {
// Failed level - score based on how many requirements passed
return (passedCount * 100) / (passedCount + failedCount)
}
// Passed level - bonus points for performance
baseScore := 70 // Base score for passing
// Performance bonuses (up to 30 points)
performanceBonus := 0
// Throughput bonus (higher throughput = better)
if throughput, ok := metrics["throughput"].(int); ok && throughput > 0 {
performanceBonus += min(10, throughput/100) // 1 point per 100 RPS, max 10
}
// Availability bonus (higher availability = better)
if availability, ok := metrics["availability"].(float64); ok {
if availability >= 99.9 {
performanceBonus += 10
} else if availability >= 99.5 {
performanceBonus += 5
}
}
// Cost efficiency bonus (lower cost = better)
if cost, ok := metrics["cost_monthly"].(int); ok && cost > 0 {
if cost <= 50 {
performanceBonus += 10
} else if cost <= 100 {
performanceBonus += 5
}
}
return min(100, baseScore+performanceBonus)
}
// Helper function
func min(a, b int) int {
if a < b {
return a
}
return b
}

1
router/router.go

@ -18,6 +18,7 @@ func SetupRoutes(tmpl *template.Template) *http.ServeMux { @@ -18,6 +18,7 @@ func SetupRoutes(tmpl *template.Template) *http.ServeMux {
mux.Handle("/simulate", auth.RequireAuth(&handlers.SimulationHandler{}))
mux.HandleFunc("/login", auth.LoginHandler)
mux.HandleFunc("/callback", auth.CallbackHandler)
mux.HandleFunc("/ws", handlers.Messages)
return mux
}

102
static/app.js

@ -91,9 +91,54 @@ export class CanvasApp { @@ -91,9 +91,54 @@ export class CanvasApp {
node.y = y;
});
this.runButton.addEventListener('click', () => {
this.runButton.addEventListener('click', async () => {
const designData = this.exportDesign();
console.log(JSON.stringify(designData))
// Try to get level info from URL or page context
const levelInfo = this.getLevelInfo();
const requestBody = {
design: designData,
...levelInfo
};
console.log('Sending design to simulation:', JSON.stringify(requestBody));
// Disable button and show loading state
this.runButton.disabled = true;
this.runButton.textContent = 'Running Simulation...';
try {
const response = await fetch('/simulate', {
method: 'POST',
headers: {
'Content-Type': 'application/json',
},
body: JSON.stringify(requestBody)
});
if (!response.ok) {
throw new Error(`HTTP ${response.status}: ${response.statusText}`);
}
const result = await response.json();
if (result.Success) {
console.log('Simulation successful:', result);
this.showResults(result);
} else {
console.error('Simulation failed:', result.Error);
this.showError(result.Error || 'Simulation failed');
}
} catch (error) {
console.error('Network error:', error);
this.showError('Failed to run simulation: ' + error.message);
} finally {
// Re-enable button
this.runButton.disabled = false;
this.runButton.textContent = 'Test Design';
}
});
this.canvas.addEventListener('click', () => {
@ -267,4 +312,57 @@ export class CanvasApp { @@ -267,4 +312,57 @@ export class CanvasApp {
return { nodes, connections };
}
getLevelInfo() {
// Try to extract level info from URL path like /play/url-shortener
const pathParts = window.location.pathname.split('/');
if (pathParts.length >= 3 && pathParts[1] === 'play') {
const levelName = decodeURIComponent(pathParts[2]);
return {
levelName: levelName,
difficulty: 'easy' // Default difficulty, could be enhanced later
};
}
return {};
}
showResults(result) {
const metrics = result.Metrics;
let message = '';
// Level validation results
if (result.LevelName) {
if (result.Passed) {
message += `Level "${result.LevelName}" PASSED!\n`;
message += `Score: ${result.Score}/100\n\n`;
} else {
message += `Level "${result.LevelName}" FAILED\n`;
message += `Score: ${result.Score}/100\n\n`;
}
// Add detailed feedback
if (result.Feedback && result.Feedback.length > 0) {
message += result.Feedback.join('\n') + '\n\n';
}
} else {
message += `Simulation Complete!\n\n`;
}
// Performance metrics
message += `Performance Metrics:\n`;
message += `• Throughput: ${metrics.throughput} req/sec\n`;
message += `• Avg Latency: ${metrics.latency_avg}ms\n`;
message += `• Availability: ${metrics.availability.toFixed(1)}%\n`;
message += `• Monthly Cost: $${metrics.cost_monthly}\n\n`;
message += `Timeline: ${result.Timeline.length} ticks simulated`;
alert(message);
// TODO: Later replace with redirect to results page or modal
console.log('Full simulation data:', result);
}
showError(errorMessage) {
alert(`Simulation Error:\n\n${errorMessage}\n\nPlease check your design and try again.`);
}
}

4
static/plugins/database.js

@ -5,6 +5,8 @@ PluginRegistry.register('database', { @@ -5,6 +5,8 @@ PluginRegistry.register('database', {
label: 'Database',
props: [
{ name: 'label', type: 'string', default: 'Database', group: 'label-group' },
{ name: 'replication', type: 'number', default: 1, group: 'db-group' }
{ name: 'replication', type: 'number', default: 1, group: 'db-group' },
{ name: 'maxRPS', type: 'number', default: 1000, group: 'db-group' },
{ name: 'baseLatencyMs', type: 'number', default: 10, group: 'db-group' }
]
});

3
static/plugins/messageQueue.js

@ -6,6 +6,7 @@ PluginRegistry.register('messageQueue', { @@ -6,6 +6,7 @@ PluginRegistry.register('messageQueue', {
props: [
{ name: 'label', type: 'string', default: 'MQ', group: 'label-group' },
{ name: 'queueCapacity', type: 'number', default: 10000, group: 'mq-group' },
{ name: 'retentionSeconds', type: 'number', default: 600, group: 'mq-group' }
{ name: 'retentionSeconds', type: 'number', default: 600, group: 'mq-group' },
{ name: 'processingRate', type: 'number', default: 100, group: 'mq-group' }
]
});

4
static/plugins/monitorAlerting.js

@ -6,6 +6,8 @@ PluginRegistry.register('monitoring/alerting', { @@ -6,6 +6,8 @@ PluginRegistry.register('monitoring/alerting', {
props: [
{ name: 'label', type: 'string', default: 'monitor', group: 'label-group' },
{ name: 'tool', type: 'string', default: 'Prometheus', group: 'monitor-group' },
{ name: 'alertThreshold', type: 'number', default: 80, group: 'monitor-group' }
{ name: 'alertMetric', type: 'string', default: 'latency', group: 'monitor-group' },
{ name: 'thresholdValue', type: 'number', default: 80, group: 'monitor-group' },
{ name: 'thresholdUnit', type: 'string', default: 'ms', group: 'monitor-group' }
]
});

Loading…
Cancel
Save