Close Menu
    Facebook X (Twitter) Instagram
    • Privacy Policy
    • Terms Of Service
    • Social Media Disclaimer
    • DMCA Compliance
    • Anti-Spam Policy
    Facebook X (Twitter) Instagram
    Crypto Love You
    • Home
    • Crypto News
      • Bitcoin
      • Ethereum
      • Altcoins
      • Blockchain
      • DeFi
    • AI News
    • Stock News
    • Learn
      • AI for Beginners
      • AI Tips
      • Make Money with AI
    • Reviews
    • Tools
      • Best AI Tools
      • Crypto Market Cap List
      • Stock Market Overview
      • Market Heatmap
    • Contact
    Crypto Love You
    Home » AI News » How a Haystack-Powered Multi-Agent System Detects Incidents, Investigates Metrics and Logs, and Produces Production-Grade Incident Reviews End-to-End
    How a Haystack-Powered Multi-Agent System Detects Incidents, Investigates Metrics and Logs, and Produces Production-Grade Incident Reviews End-to-End
    AI News

    How a Haystack-Powered Multi-Agent System Detects Incidents, Investigates Metrics and Logs, and Produces Production-Grade Incident Reviews End-to-End

    January 27, 2026 · 4 Mins Read
    Share
    Facebook Twitter LinkedIn Pinterest Email
    coinbase


    @tool
    def sql_investigate(query: str) -> dict:
        """Run a SQL query on the module-level connection ``con`` and summarize it.

        Args:
            query: SQL text, passed unmodified to ``con.execute``.

        Returns:
            On success, a dict with ``rows`` (total number of result rows),
            ``columns`` (list of column labels) and ``preview`` (at most the
            first 30 rows as a list of record dicts). On any failure, a dict
            with a single ``error`` key holding the exception message.
        """
        # NOTE(review): ``query`` is executed verbatim. If the caller is an LLM
        # agent, confirm that ``con`` is opened read-only (or otherwise
        # sandboxed) so generated SQL cannot modify data.
        try:
            df = con.execute(query).df()
            head = df.head(30)
            return {
                "rows": int(len(df)),
                "columns": list(df.columns),
                "preview": head.to_dict(orient="records"),
            }
        except Exception as e:
            # Deliberately broad: the failure is returned to the caller as data
            # instead of being raised out of the tool.
            return {"error": str(e)}

    @tool
    def log_pattern_scan(window_start_iso: str, window_end_iso: str, top_k: int = 8) -> dict:
        """Summarize WARN/ERROR log activity inside a time window.

        Reads the module-level ``logs_df`` frame and uses its ``ts``, ``level``,
        ``error_kind``, ``service``, ``endpoint``, ``region`` and ``latency_ms``
        columns.

        Args:
            window_start_iso: Window start, parsed with ``pd.to_datetime``
                (inclusive).
            window_end_iso: Window end, parsed with ``pd.to_datetime``
                (inclusive).
            top_k: Maximum number of entries kept in each "top" breakdown.

        Returns:
            If no log rows fall in the window, a dict with ``rows`` set to 0 and
            empty ``top_error_kinds`` / ``top_services`` / ``top_endpoints``
            lists. Otherwise a dict with the row counts, the 95th-percentile
            latency over all rows in the window, and value-count dicts for the
            most frequent error kinds, services, endpoints and regions among
            WARN/ERROR rows.
        """
        ws = pd.to_datetime(window_start_iso)
        we = pd.to_datetime(window_end_iso)
        df = logs_df[(logs_df["ts"] >= ws) & (logs_df["ts"] <= we)].copy()
        if df.empty:
            return {"rows": 0, "top_error_kinds": [], "top_services": [], "top_endpoints": []}

        # Convert once instead of once per breakdown.
        k = int(top_k)

        # Missing or empty error kinds are bucketed under the literal "NONE".
        df["error_kind_norm"] = df["error_kind"].fillna("").replace("", "NONE")
        err = df[df["level"].isin(["WARN", "ERROR"])].copy()

        top_err = err["error_kind_norm"].value_counts().head(k).to_dict()
        top_svc = err["service"].value_counts().head(k).to_dict()
        top_ep = err["endpoint"].value_counts().head(k).to_dict()
        by_region = err.groupby("region").size().sort_values(ascending=False).head(k).to_dict()

        # Latency percentile is taken over every row in the window, not only
        # the WARN/ERROR subset.
        p95_latency = float(np.percentile(df["latency_ms"].values, 95))
        return {
            "rows": int(len(df)),
            "warn_error_rows": int(len(err)),
            "p95_latency_ms": p95_latency,
            "top_error_kinds": top_err,
            "top_services": top_svc,
            "top_endpoints": top_ep,
            "error_by_region": by_region,
        }

    @tool
    def propose_mitigations(hypothesis: str) -> dict:
        """Map a root-cause hypothesis to a canned list of mitigation actions.

        The hypothesis is lower-cased and checked for keyword substrings; every
        matching keyword group contributes its actions, in a fixed order
        (database/pool, then timeout/upstream, then cache). If nothing matches,
        a generic observability/load-test pair is returned instead.

        Args:
            hypothesis: Free-text description of the suspected cause.

        Returns:
            A dict with the original ``hypothesis`` and a ``mitigations`` list
            (at most 10 entries), each entry having ``action``, ``owner`` and
            ``eta_days`` keys.
        """
        h = hypothesis.lower()
        mitigations = []
        # Plain substring checks: "db" also matches inside longer words.
        if "conn" in h or "pool" in h or "db" in h:
            mitigations += [
                {"action": "Increase DB connection pool size (bounded) and add backpressure at db-proxy", "owner": "Platform", "eta_days": 3},
                {"action": "Add circuit breaker + adaptive timeouts between api-gateway and db-proxy", "owner": "Backend", "eta_days": 5},
                {"action": "Tune query hotspots; add indexes for top offending endpoints", "owner": "Data/DBA", "eta_days": 7},
            ]
        if "timeout" in h or "upstream" in h:
            mitigations += [
                {"action": "Implement hedged requests for idempotent calls (carefully) and tighten retry budgets", "owner": "Backend", "eta_days": 6},
                {"action": "Add upstream SLO-aware load shedding at api-gateway", "owner": "Platform", "eta_days": 7},
            ]
        if "cache" in h:
            mitigations += [
                {"action": "Add request coalescing and negative caching to prevent cache-miss storms", "owner": "Backend", "eta_days": 6},
                {"action": "Prewarm cache for top endpoints during deploys", "owner": "SRE", "eta_days": 4},
            ]
        if not mitigations:
            # Fallback when no keyword matched.
            mitigations += [
                {"action": "Add targeted dashboards and alerts for the suspected bottleneck metric", "owner": "SRE", "eta_days": 3},
                {"action": "Run controlled load test to reproduce and validate the hypothesis", "owner": "Perf Eng", "eta_days": 5},
            ]
        # Cap the plan length so the result stays bounded if more groups are added.
        mitigations = mitigations[:10]
        return {"hypothesis": hypothesis, "mitigations": mitigations}

    def _load_json_or_note(raw, label: str):
        """Decode ``raw`` as JSON, or return a note dict naming ``label`` on failure."""
        try:
            return json.loads(raw)
        except (TypeError, ValueError):
            # json.JSONDecodeError is a ValueError; TypeError covers non-string input.
            return {"note": f"{label} was not valid JSON"}


    @tool
    def draft_postmortem(title: str, window_start_iso: str, window_end_iso: str, customer_impact: str, suspected_root_cause: str, key_facts_json: str, mitigations_json: str) -> dict:
        """Assemble a postmortem document from incident details.

        Args:
            title: Postmortem title.
            window_start_iso: Incident start, copied into the window and timeline.
            window_end_iso: Incident end, copied into the window and timeline.
            customer_impact: Description of customer-visible impact.
            suspected_root_cause: Suspected root cause text.
            key_facts_json: JSON-encoded key facts; if it cannot be decoded a
                ``{"note": ...}`` placeholder is stored instead.
            mitigations_json: JSON-encoded mitigations. If it decodes to a dict
                with a ``mitigations`` key, that value becomes the corrective
                actions; otherwise the decoded value is used as-is. If it cannot
                be decoded a ``{"note": ...}`` placeholder is used.

        Returns:
            ``{"postmortem_json": doc}`` where ``doc`` is the assembled dict.
        """
        # Local import: the block previously only needed ``datetime`` itself.
        from datetime import timezone

        facts = _load_json_or_note(key_facts_json, "key_facts_json")
        mits = _load_json_or_note(mitigations_json, "mitigations_json")

        # ``mitigations_json`` may decode to a bare list (or another non-dict
        # value); calling ``.get`` on it would raise AttributeError.
        corrective_actions = mits.get("mitigations", mits) if isinstance(mits, dict) else mits

        doc = {
            "title": title,
            # Aware UTC "now"; datetime.utcnow() is deprecated since Python 3.12.
            "date_utc": datetime.now(timezone.utc).strftime("%Y-%m-%d"),
            "incident_window_utc": {"start": window_start_iso, "end": window_end_iso},
            "customer_impact": customer_impact,
            "suspected_root_cause": suspected_root_cause,
            "detection": {
                "how_detected": "Automated anomaly detection + error-rate spike triage",
                "gaps": ["Add earlier saturation alerting", "Improve symptom-to-cause correlation dashboards"],
            },
            # The two middle timeline entries are fixed template text, not
            # derived from the inputs.
            "timeline": [
                {"t": window_start_iso, "event": "Symptoms begin (latency/error anomalies)"},
                {"t": "T+10m", "event": "On-call begins triage; identifies top services/endpoints"},
                {"t": "T+25m", "event": "Mitigation actions initiated (throttling/backpressure)"},
                {"t": window_end_iso, "event": "Customer impact ends; metrics stabilize"},
            ],
            "key_facts": facts,
            "corrective_actions": corrective_actions,
            "followups": [
                {"area": "Reliability", "task": "Add saturation signals + budget-based retries", "priority": "P1"},
                {"area": "Observability", "task": "Add golden signals per service/endpoint", "priority": "P1"},
                {"area": "Performance", "task": "Reproduce with load test and validate fix", "priority": "P2"},
            ],
            "appendix": {"notes": "Generated by a Haystack multi-agent workflow (non-RAG)."},
        }
        return {"postmortem_json": doc}

    synthesia

    # Chat model shared by every agent defined in this file.
    llm = OpenAIChatGenerator(model="gpt-4o-mini")

    # Agent state layout: one entry per state key, each declaring its type.
    # ``investigation_notes`` additionally names ``merge_lists`` as its handler.
    state_schema = {
        "metrics_csv_path": {"type": str},
        "logs_csv_path": {"type": str},
        "metrics_summary": {"type": dict},
        "logs_summary": {"type": dict},
        "incident_window": {"type": dict},
        "investigation_notes": {"type": list, "handler": merge_lists},
        "hypothesis": {"type": str},
        "key_facts": {"type": dict},
        "mitigation_plan": {"type": dict},
        "postmortem": {"type": dict},
    }

    # System prompt for the profiler agent: evidence -> structured findings.
    profiler_prompt = """You are a specialist incident profiler.
    Goal: turn raw metrics/log summaries into crisp, high-signal findings.
    Rules:
    - Prefer calling tools over guessing.
    - Output must be a JSON object with keys: window, symptoms, top_contributors, hypothesis, key_facts.
    - Hypothesis must be falsifiable and mention at least one specific service and mechanism.
    """

    # System prompt for the writer agent: evidence + plan -> postmortem JSON.
    writer_prompt = """You are a specialist postmortem writer.
    Goal: produce a high-quality postmortem JSON (not prose) using the provided evidence and mitigation plan.
    Rules:
    - Call tools only if needed.
    - Keep 'suspected_root_cause' specific and not generic.
    - Ensure corrective actions have owners and eta_days.
    """

    # System prompt for the coordinator agent: lists the workflow steps and
    # the shape of the final answer.
    coordinator_prompt = """You are an incident commander coordinating a non-RAG multi-agent workflow.
    You must:
    1) Load inputs
    2) Find an incident window (use p95_ms or error_rate)
    3) Investigate with targeted SQL and log pattern scan
    4) Ask the specialist profiler to synthesize evidence
    5) Propose mitigations
    6) Ask the specialist writer to draft a postmortem JSON
    Return a final response with:
    - A short executive summary (max 10 lines)
    - The postmortem JSON
    - A compact runbook checklist (bulleted)
    """

    # Specialist agent with the data-loading and investigation tools.
    profiler_agent = Agent(
        chat_generator=llm,
        tools=[load_inputs, detect_incident_window, sql_investigate, log_pattern_scan],
        system_prompt=profiler_prompt,
        exit_conditions=["text"],
        state_schema=state_schema,
    )

    # Specialist agent whose only tool is the postmortem drafter.
    writer_agent = Agent(
        chat_generator=llm,
        tools=[draft_postmortem],
        system_prompt=writer_prompt,
        exit_conditions=["text"],
        state_schema=state_schema,
    )

    # Wrap each specialist agent as a tool so the coordinator can call it.
    profiler_tool = ComponentTool(
        component=profiler_agent,
        name="profiler_specialist",
        description="Synthesizes incident evidence into a falsifiable hypothesis and key facts (JSON output).",
        outputs_to_string={"source": "last_message"},
    )

    writer_tool = ComponentTool(
        component=writer_agent,
        name="postmortem_writer_specialist",
        description="Drafts a postmortem JSON using title/window/impact/rca/facts/mitigations.",
        outputs_to_string={"source": "last_message"},
    )

    # Top-level agent: has every plain tool plus the two specialist wrappers.
    coordinator_agent = Agent(
        chat_generator=llm,
        tools=[
            load_inputs,
            detect_incident_window,
            sql_investigate,
            log_pattern_scan,
            propose_mitigations,
            profiler_tool,
            writer_tool,
            draft_postmortem,
        ],
        system_prompt=coordinator_prompt,
        exit_conditions=["text"],
        state_schema=state_schema,
    )



    Source link

    aistudios
    Share. Facebook Twitter Pinterest LinkedIn Tumblr Email
    CryptoExpert
    • Website

    Related Posts

    Trustpilot partners with big model vendors

    March 18, 2026

    Google AI Releases WAXAL: A Multilingual African Speech Dataset for Training Automatic Speech Recognition and Text-to-Speech Models

    March 17, 2026

    U.S. Holds Off on New AI Chip Export Rules in Surprise Move in Tech Export Wars

    March 16, 2026

    Can AI help predict which heart-failure patients will worsen within a year? | MIT News

    March 15, 2026
    Add A Comment
    Leave A Reply Cancel Reply

    notion
    Latest Posts

    Trustpilot partners with big model vendors

    March 18, 2026

    I discovered how to make $100K with Nano Banana AI (Real Results) 🤯

    March 18, 2026

    AI BASICS in 10 Minutes (2026 Beginner Guide) – BeerBiceps

    March 18, 2026

    Bitcoin Price Rally To $79K Would Make Spot ETF Holders Whole Again

    March 18, 2026

    DAOs May Need To Ditch Decentralization To Court Institutions

    March 17, 2026
    binance
    LEGAL INFORMATION
    • Privacy Policy
    • Terms Of Service
    • Social Media Disclaimer
    • DMCA Compliance
    • Anti-Spam Policy
    Top Insights

    Bitget Research Analyst Breaks Down What’s Happening With The Bitcoin Price

    March 18, 2026

    Is Dogecoin Ready to Rally?

    March 18, 2026
    notion
    Facebook X (Twitter) Instagram Pinterest
    © 2026 CryptoLoveYou.com - All rights reserved.

    Type above and press Enter to search. Press Esc to cancel.