demos for network copilot and sql analyzer (#57)

* pulled from main branch after adding enums and made changes * added sql_analyzer folder and built a demo for Employee stats function calling. "top_employees" and "aggregate_stats". * sql_analyzer * After addressing PR comments * PR comments * PR comments * Added Network Analyzer FC Code * Added network Analyzer code for diff timeframes * Network Copilot and Employee Details demos are updated with their descriptions and resolved the PR comments * Added 2nd function in network copilot * Added 2nd function in network copilot * Added 2nd function in network copilot * Added 2nd function in network copilot * Added 2nd function in network copilot
2026-06-11 15:05:14 +02:00 · 2024-09-19 11:40:31 -07:00 · 2024-09-19 11:40:31 -07:00 · ed6a9139e6
commit ed6a9139e6
parent a91fbdbf1c
11 changed files with 1052 additions and 1 deletions
--- a/model_server/app/employee_data_generator.py
+++ b/model_server/app/employee_data_generator.py
@ -0,0 +1,59 @@
+import pandas as pd
+import random
+import datetime
+
+def generate_employee_data(conn, count=200):
+    """Create and populate the ``employees`` table with random demo records.
+
+    Args:
+        conn: Open database connection handed to ``DataFrame.to_sql``.
+        count: Number of synthetic employee rows to generate (default 200).
+
+    Returns:
+        None. The rows are written to the ``employees`` table through ``conn``.
+    """
+    # Pools the random records draw from
+    names = ["Alice", "Bob", "Charlie", "David", "Eve", "Frank", "Grace", "Hank", "Ivy", "Jack"]
+    positions = ["Manager", "Engineer", "Salesperson", "HR Specialist", "Marketing Analyst"]
+    departments = ["Engineering", "Marketing", "HR", "Sales", "Finance"]
+    locations = ["New York", "San Francisco", "Austin", "Boston", "Chicago"]
+
+    first_hire_date = datetime.date(2000, 1, 1)
+    last_hire_date = datetime.date(2023, 12, 31)
+    hire_window_days = (last_hire_date - first_hire_date).days
+
+    def random_hire_date():
+        """Return a random date from first_hire_date to last_hire_date, inclusive."""
+        # randint includes both ends, so last_hire_date itself can be drawn
+        return first_hire_date + datetime.timedelta(days=random.randint(0, hire_window_days))
+
+    def generate_employee_records(record_count):
+        """Return a list of ``record_count`` random employee dicts."""
+        employees = []
+
+        for _ in range(record_count):
+            name = random.choice(names)
+            position = random.choice(positions)
+            salary = round(random.uniform(50000, 150000), 2)  # Salary between 50,000 and 150,000
+            department = random.choice(departments)
+            location = random.choice(locations)
+            hire_date = random_hire_date()
+            performance_score = round(random.uniform(1, 5), 2)  # Performance score between 1.0 and 5.0
+            years_of_experience = random.randint(1, 30)  # Years of experience between 1 and 30
+
+            # Key order here is the column order of the resulting table
+            employees.append({
+                "position": position,
+                "name": name,
+                "salary": salary,
+                "department": department,
+                "location": location,
+                "hire_date": hire_date,
+                "performance_score": performance_score,
+                "years_of_experience": years_of_experience,
+            })
+
+        return employees
+
+    # Generate the requested number of random employee records
+    employee_records = generate_employee_records(count)
+
+    # Convert the list of dictionaries to a DataFrame and write it out
+    df = pd.DataFrame(employee_records)
+    df.to_sql('employees', conn, index=False)
--- a/model_server/app/load_models.py
+++ b/model_server/app/load_models.py
@ -2,6 +2,9 @@ import os
 import sentence_transformers
 from gliner import GLiNER
 from transformers import pipeline
+import sqlite3
+from employee_data_generator import generate_employee_data
+from network_data_generator import generate_device_data, generate_interface_stats_data, generate_flow_data

 def load_transformers(models = os.getenv("MODELS", "BAAI/bge-large-en-v1.5")):
    transformers = {}
@ -26,3 +29,22 @@ def load_zero_shot_models(models = os.getenv("ZERO_SHOT_MODELS", "tasksource/deb
        zero_shot_models[model] = pipeline("zero-shot-classification",model=model)

    return zero_shot_models
+
+def load_sql():
+    """Build the in-memory SQLite database used by the demo endpoints.
+
+    Returns:
+        The ``sqlite3`` connection after the employee, device, interface-stats
+        and flow generators have been run against it.
+    """
+    # In-memory database: nothing is persisted to disk
+    conn = sqlite3.connect(':memory:')
+
+    # create and load the employees table
+    generate_employee_data(conn)
+
+    # create and load the device table; the returned frame seeds the two tables below
+    device_data = generate_device_data(conn)
+
+    # create and load the interfacestats table
+    generate_interface_stats_data(conn, device_data)
+
+    # create and load the ts_flow table
+    generate_flow_data(conn, device_data)
+
+
+    return conn
--- a/model_server/app/main.py
+++ b/model_server/app/main.py
@ -2,8 +2,16 @@ import random
 from fastapi import FastAPI, Response, HTTPException
 from pydantic import BaseModel
 from load_models import load_ner_models, load_transformers, load_zero_shot_models
-from datetime import date, timedelta
+from datetime import datetime, date, timedelta, timezone
 import string
+import pandas as pd
+from load_models import load_sql
+import logging
+from dateparser import parse
+from network_data_generator import convert_to_ago_format, load_params
+
+# Root logging setup for the model server: INFO level, "time - level - message" lines.
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)

 transformers = load_transformers()
 ner_models = load_ner_models()
@ -144,6 +152,163 @@ async def weather(req: WeatherRequest, res: Response):

    return weather_forecast

+
+'''
+*****
+Adding new functions to test the usecases - Sampreeth
+*****
+'''
+
+# Single SQLite connection shared by the SQL-backed endpoints below; built once at import time.
+conn = load_sql()
+# Name column of the employees table; top_employees rebinds a local of the same name.
+name_col = "name"
+
+class TopEmployees(BaseModel):
+    # Request body for POST /top_employees.
+    # Column the employees are partitioned by before ranking
+    grouping: str
+    # Column to rank by; "yoe" and "rating" are accepted as shorthands by the endpoint
+    ranking_criteria: str
+    # Keep rows whose dense rank within their group is <= top_n
+    top_n: int
+
+
+@app.post("/top_employees")
+async def top_employees(req: TopEmployees, res: Response):
+    """Return the top-ranked employees within each group.
+
+    Employees are partitioned by ``req.grouping`` and dense-ranked by
+    ``req.ranking_criteria`` in descending order; rows with rank <= ``req.top_n``
+    are returned, so ties can yield more than ``top_n`` rows per group.
+
+    Args:
+        req: Grouping column, ranking column ("yoe" / "rating" shorthands are
+            expanded) and the rank cut-off.
+        res: FastAPI response object (unused).
+
+    Returns:
+        A list of dicts, one per selected employee.
+
+    Raises:
+        HTTPException: 400 when the grouping or ranking column is not a column
+            of the employees table.
+    """
+    name_col = "name"
+    # Column names cannot be bound as SQL parameters, so anything interpolated
+    # into the query text must come from this fixed allow-list.
+    employee_columns = {
+        "position", "name", "salary", "department", "location",
+        "hire_date", "performance_score", "years_of_experience",
+    }
+    criteria_aliases = {"yoe": "years_of_experience", "rating": "performance_score"}
+
+    logger.info(f"{'* ' * 50}\n\nCaptured Ranking Criteria: {req.ranking_criteria}\n\n{'* ' * 50}")
+
+    ranking_criteria = criteria_aliases.get(req.ranking_criteria, req.ranking_criteria)
+
+    logger.info(f"{'* ' * 50}\n\nFinal Ranking Criteria: {ranking_criteria}\n\n{'* ' * 50}")
+
+    if req.grouping not in employee_columns:
+        raise HTTPException(status_code=400, detail="Invalid grouping column")
+    if ranking_criteria not in employee_columns:
+        raise HTTPException(status_code=400, detail="Invalid ranking criteria")
+
+    # Identifiers are validated above; the numeric cut-off is bound as a parameter.
+    query = f"""
+    SELECT {req.grouping}, {name_col}, {ranking_criteria}
+    FROM (
+        SELECT {req.grouping}, {name_col}, {ranking_criteria},
+               DENSE_RANK() OVER (PARTITION BY {req.grouping} ORDER BY {ranking_criteria} DESC) as emp_rank
+        FROM employees
+    ) ranked_employees
+    WHERE emp_rank <= :top_n;
+    """
+    result_df = pd.read_sql_query(query, conn, params={"top_n": req.top_n})
+    return result_df.to_dict(orient='records')
+
+
+class AggregateStats(BaseModel):
+    # Request body for POST /aggregate_stats.
+    # Column used in GROUP BY
+    grouping: str
+    # Column the aggregate is computed over; "yoe" is accepted as a shorthand by the endpoint
+    aggregate_criteria: str
+    # Aggregate function name, e.g. sum/avg/min/max or a synonym such as "average"
+    aggregate_type: str
+
+@app.post("/aggregate_stats")
+async def aggregate_stats(req: AggregateStats, res: Response):
+    """Return one aggregate value per group of the employees table.
+
+    Args:
+        req: Grouping column, column to aggregate ("yoe" / "rating" shorthands
+            are expanded) and aggregate function. sum/avg/min/max are accepted
+            in any letter case; count, total, average, minimum and maximum are
+            mapped to COUNT, SUM, AVG, MIN and MAX.
+        res: FastAPI response object (unused).
+
+    Returns:
+        A list of dicts with the grouping column and a column named
+        ``<aggregate_type>_<aggregate_criteria>``.
+
+    Raises:
+        HTTPException: 400 for an unknown aggregate type or a grouping /
+            aggregate column that is not a column of the employees table.
+    """
+    # Column names and function names cannot be bound as SQL parameters, so
+    # everything interpolated into the query text is checked against fixed sets.
+    employee_columns = {
+        "position", "name", "salary", "department", "location",
+        "hire_date", "performance_score", "years_of_experience",
+    }
+    criteria_aliases = {"yoe": "years_of_experience", "rating": "performance_score"}
+    native_aggregates = {"sum", "avg", "min", "max"}
+    aggregate_synonyms = {
+        "count": "COUNT",
+        "total": "SUM",
+        "average": "AVG",
+        "minimum": "MIN",
+        "maximum": "MAX",
+    }
+
+    logger.info(f"{'* ' * 50}\n\nCaptured Aggregate Criteria: {req.aggregate_criteria}\n\n{'* ' * 50}")
+
+    aggregate_criteria = criteria_aliases.get(req.aggregate_criteria, req.aggregate_criteria)
+
+    logger.info(f"{'* ' * 50}\n\nFinal Aggregate Criteria: {aggregate_criteria}\n\n{'* ' * 50}")
+
+    logger.info(f"{'* ' * 50}\n\nCaptured Aggregate Type: {req.aggregate_type}\n\n{'* ' * 50}")
+    requested_type = req.aggregate_type.lower()
+    if requested_type in native_aggregates:
+        # Keep the caller's spelling so the result column name is unchanged
+        aggregate_type = req.aggregate_type
+    elif requested_type in aggregate_synonyms:
+        aggregate_type = aggregate_synonyms[requested_type]
+    else:
+        raise HTTPException(status_code=400, detail="Invalid aggregate type")
+
+    logger.info(f"{'* ' * 50}\n\nFinal Aggregate Type: {aggregate_type}\n\n{'* ' * 50}")
+
+    if req.grouping not in employee_columns:
+        raise HTTPException(status_code=400, detail="Invalid grouping column")
+    if aggregate_criteria not in employee_columns:
+        raise HTTPException(status_code=400, detail="Invalid aggregate criteria")
+
+    query = f"""
+    SELECT {req.grouping}, {aggregate_type}({aggregate_criteria}) as {aggregate_type}_{aggregate_criteria}
+    FROM employees
+    GROUP BY {req.grouping};
+    """
+    result_df = pd.read_sql_query(query, conn)
+    return result_df.to_dict(orient='records')
+
+class PacketDropCorrelationRequest(BaseModel):
+    # Request body for POST /interface_down_pkt_drop. Every field defaults to
+    # None; load_params only adds a filter for the fields that were supplied.
+    # NOTE(review): annotations are plain str/int while the default is None —
+    # confirm explicit nulls in the JSON body validate under the pydantic version in use.
+    from_time: str = None  # Optional natural language timeframe
+    ifname: str = None     # Optional interface name filter
+    region: str = None     # Optional region filter
+    # Optional inclusive lower/upper bounds on the interface counters
+    min_in_errors: int = None
+    max_in_errors: int = None
+    min_out_errors: int = None
+    max_out_errors: int = None
+    min_in_discards: int = None
+    max_in_discards: int = None
+    min_out_discards: int = None
+    max_out_discards: int = None
+
+
+@app.post("/interface_down_pkt_drop")
+async def interface_down_packet_drop(req: PacketDropCorrelationRequest, res: Response):
+    """Return interface error/discard samples joined with flows seen on the same device.
+
+    Interface samples at or after the requested ``from_time`` are joined to
+    their device and to that device's flows. The join to ``ts_flow`` is on the
+    device address only, so each interface sample is paired with every flow of
+    that device.
+
+    Args:
+        req: Optional time window and filters; see ``load_params``.
+        res: FastAPI response object (unused).
+
+    Returns:
+        A list of row dicts. When nothing matches, a single placeholder row
+        with zeroed counters and the current UTC time is returned instead.
+
+    Raises:
+        HTTPException: 400 when ``from_time`` cannot be interpreted.
+    """
+    loaded = load_params(req)
+    # load_params reports an unparseable from_time as an {"error": ...} dict
+    # instead of the usual (params, filters) pair; unpacking that dict would fail.
+    if isinstance(loaded, dict):
+        raise HTTPException(status_code=400, detail=loaded["error"])
+    params, filters = loaded
+
+    # Join the filters using AND
+    where_clause = " AND ".join(filters)
+    if where_clause:
+        where_clause = "AND " + where_clause
+
+    # Query packet errors and flows from interfacestats and ts_flow
+    query = f"""
+    SELECT
+      d.switchip AS device_ip_address,
+      i.in_errors,
+      i.in_discards,
+      i.out_errors,
+      i.out_discards,
+      i.ifname,
+      t.src_addr,
+      t.dst_addr,
+      t.time AS flow_time,
+      i.time AS interface_time
+    FROM
+      device d
+    INNER JOIN
+      interfacestats i
+      ON d.device_mac_address = i.device_mac_address
+    INNER JOIN
+      ts_flow t
+      ON d.switchip = t.sampler_address
+    WHERE
+      i.time >= :from_time  -- Using the converted timestamp
+      {where_clause}
+    ORDER BY
+      i.time;
+    """
+
+    correlated_data = pd.read_sql_query(query, conn, params=params)
+
+    if correlated_data.empty:
+        default_response = {
+            "device_ip_address": "0.0.0.0",  # Placeholder IP
+            "in_errors": 0,
+            "in_discards": 0,
+            "out_errors": 0,
+            "out_discards": 0,
+            "ifname": req.ifname or "unknown",  # Placeholder or interface provided in the request
+            "src_addr": "0.0.0.0",  # Placeholder source IP
+            "dst_addr": "0.0.0.0",  # Placeholder destination IP
+            "flow_time": str(datetime.now(timezone.utc)),  # Current timestamp or placeholder
+            "interface_time": str(datetime.now(timezone.utc))  # Current timestamp or placeholder
+        }
+        return [default_response]
+
+    logger.info(f"Correlated Packet Drop Data: {correlated_data}")
+
+    return correlated_data.to_dict(orient='records')
 class InsuranceClaimDetailsRequest(BaseModel):
  policy_number: str

@ -159,3 +324,82 @@ async def insurance_claim_details(req: InsuranceClaimDetailsRequest, res: Respon
    }

    return claim_details
+
+
+class FlowPacketErrorCorrelationRequest(BaseModel):
+    # Request body for POST /packet_errors_impact_flow. Every field defaults to
+    # None; load_params only adds a filter for the fields that were supplied.
+    # NOTE(review): annotations are plain str/int while the default is None —
+    # confirm explicit nulls in the JSON body validate under the pydantic version in use.
+    from_time: str = None  # Optional natural language timeframe
+    ifname: str = None     # Optional interface name filter
+    region: str = None     # Optional region filter
+    # Optional inclusive lower/upper bounds on the interface counters
+    min_in_errors: int = None
+    max_in_errors: int = None
+    min_out_errors: int = None
+    max_out_errors: int = None
+    min_in_discards: int = None
+    max_in_discards: int = None
+    min_out_discards: int = None
+    max_out_discards: int = None
+
+@app.post("/packet_errors_impact_flow")
+async def packet_errors_impact_flow(req: FlowPacketErrorCorrelationRequest, res: Response):
+    """Return interface error samples paired with flows recorded close in time.
+
+    Interface samples at or after the requested ``from_time`` are joined to
+    their device and to that device's flows, keeping only pairs whose flow time
+    and interface sample time are at most 300 seconds apart.
+
+    Args:
+        req: Optional time window and filters; see ``load_params``.
+        res: FastAPI response object (unused).
+
+    Returns:
+        A list of row dicts. When nothing matches, a single placeholder row
+        with zeroed counters and the current UTC time is returned instead.
+
+    Raises:
+        HTTPException: 400 when ``from_time`` cannot be interpreted.
+    """
+    loaded = load_params(req)
+    # load_params reports an unparseable from_time as an {"error": ...} dict
+    # instead of the usual (params, filters) pair; unpacking that dict would fail.
+    if isinstance(loaded, dict):
+        raise HTTPException(status_code=400, detail=loaded["error"])
+    params, filters = loaded
+
+    # Join the filters using AND
+    where_clause = " AND ".join(filters)
+    if where_clause:
+        where_clause = "AND " + where_clause
+
+    # Query the packet errors and flows, correlating by timestamps
+    query = f"""
+    SELECT
+      d.switchip AS device_ip_address,
+      i.in_errors,
+      i.in_discards,
+      i.out_errors,
+      i.out_discards,
+      i.ifname,
+      t.src_addr,
+      t.dst_addr,
+      t.src_port,
+      t.dst_port,
+      t.packets,
+      t.time AS flow_time,
+      i.time AS error_time
+    FROM
+      device d
+    INNER JOIN
+      interfacestats i
+      ON d.device_mac_address = i.device_mac_address
+    INNER JOIN
+      ts_flow t
+      ON d.switchip = t.sampler_address
+    WHERE
+      i.time >= :from_time
+      AND ABS(strftime('%s', t.time) - strftime('%s', i.time)) <= 300  -- Correlate within 5 minutes
+      {where_clause}
+    ORDER BY
+      i.time;
+    """
+
+    correlated_data = pd.read_sql_query(query, conn, params=params)
+
+    if correlated_data.empty:
+        default_response = {
+            "device_ip_address": "0.0.0.0",  # Placeholder IP
+            "in_errors": 0,
+            "in_discards": 0,
+            "out_errors": 0,
+            "out_discards": 0,
+            "ifname": req.ifname or "unknown",  # Placeholder or interface provided in the request
+            "src_addr": "0.0.0.0",  # Placeholder source IP
+            "dst_addr": "0.0.0.0",  # Placeholder destination IP
+            "src_port": 0,
+            "dst_port": 0,
+            "packets": 0,
+            "flow_time": str(datetime.now(timezone.utc)),  # Current timestamp or placeholder
+            "error_time": str(datetime.now(timezone.utc))  # Current timestamp or placeholder
+        }
+        return [default_response]
+
+    # Return the correlated data if found
+    return correlated_data.to_dict(orient='records')
--- a/model_server/app/network_data_generator.py
+++ b/model_server/app/network_data_generator.py
@ -0,0 +1,200 @@
+import pandas as pd
+import random
+from datetime import datetime, timedelta, timezone
+import re
+import logging
+from dateparser import parse
+
+# Logging setup for this module: INFO level, "time - level - message" lines.
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
+
+# Function to convert natural language time expressions to "X {time} ago" format
+def convert_to_ago_format(expression):
+    """Rewrite a phrase containing "<number> <time unit>" as "<number> <unit> ago".
+
+    Matching ignores letter case, allows any amount of whitespace (including
+    none) between the number and the unit, and requires the unit to end at a
+    word boundary.
+
+    Args:
+        expression: Free-text time expression, e.g. "for the past 7 days".
+
+    Returns:
+        A string such as "7 days ago", or None when the expression is empty or
+        contains no recognised "<number> <unit>" pair.
+    """
+    if not expression:
+        return None
+
+    # Unit spelling to look for -> unit spelling to emit. Entries are tried in
+    # this order, so when a phrase mentions several units the earliest entry
+    # that matches wins.
+    time_units = {
+        'seconds': 'seconds',
+        'second': 'second',
+        'secs': 'seconds',
+        'sec': 'second',
+        'minutes': 'minutes',
+        'minute': 'minute',
+        'mins': 'mins',
+        'min': 'minute',
+        'hrs': 'hrs',
+        'hours': 'hours',
+        'hour': 'hour',
+        'hr': 'hour',
+        'days': 'days',
+        'day': 'day',
+        'weeks': 'weeks',
+        'week': 'week',
+        'months': 'months',
+        'month': 'month',
+        'years': 'years',
+        'yrs': 'years',
+        'year': 'year',
+        'yr': 'year',
+    }
+
+    for pattern, unit in time_units.items():
+        # \b keeps "day" from matching inside longer words such as "daylight"
+        match = re.search(fr'(\d+)\s*{pattern}\b', expression, flags=re.IGNORECASE)
+        if match:
+            return f"{match.group(1)} {unit} ago"
+
+    # No recognised "<number> <unit>" pair
+    return None
+
+
+# Function to generate random MAC addresses
+def random_mac():
+    """Return a random six-octet MAC address string starting with ``AA:BB:CC``."""
+    # Three fixed octets plus three random ones make the six octets of a MAC address
+    return "AA:BB:CC:" + ':'.join(f"{random.randint(0, 255):02X}" for _ in range(3))
+
+# Function to generate random IP addresses
+def random_ip():
+    """Return a dotted-quad string whose four octets are each drawn from 1..255."""
+    octets = [str(random.randint(1, 255)) for _ in range(4)]
+    return ".".join(octets)
+
+# Generate synthetic data for the device table
+def generate_device_data(conn, n=1000):
+    """Create the ``device`` table with ``n`` synthetic switches.
+
+    Args:
+        conn: Database connection handed to ``DataFrame.to_sql``.
+        n: Number of device rows to generate.
+
+    Returns:
+        The DataFrame that was written to the ``device`` table.
+    """
+    ordinals = range(1, n + 1)
+
+    # Random columns are drawn in the same order as the table's columns
+    switch_ips = [random_ip() for _ in ordinals]
+    regions = [random.choice(['US', 'EU', 'ASIA']) for _ in ordinals]
+    uptimes = []
+    for _ in ordinals:
+        days = random.randint(0, 10)
+        hours = random.randint(0, 23)
+        minutes = random.randint(0, 59)
+        seconds = random.randint(0, 59)
+        uptimes.append(f'{days} days {hours}:{minutes}:{seconds}')
+    mac_addresses = [random_mac() for _ in ordinals]
+
+    devices = pd.DataFrame({
+        'switchip': switch_ips,
+        'hwsku': [f'HW{k}' for k in ordinals],
+        'hostname': [f'switch{k}' for k in ordinals],
+        'osversion': [f'v{k}' for k in ordinals],
+        # Odd-numbered devices are L2, even-numbered ones L3
+        'layer': ['L2' if k % 2 == 1 else 'L3' for k in ordinals],
+        'region': regions,
+        'uptime': uptimes,
+        'device_mac_address': mac_addresses,
+    })
+    devices.to_sql('device', conn, index=False)
+    return devices
+
+# Generate synthetic data for the interfacestats table
+def generate_interface_stats_data(conn, device_df, n=1000):
+    """Create the ``interfacestats`` table with ``n`` synthetic counter samples.
+
+    Args:
+        conn: Database connection handed to ``DataFrame.to_sql``.
+        device_df: DataFrame with a ``device_mac_address`` column; each sample
+            is attributed to a MAC address picked at random from it.
+        n: Number of samples to generate.
+
+    Returns:
+        None. The rows are written to the ``interfacestats`` table.
+    """
+    # Work on a plain list: random.choice indexes by position, whereas indexing
+    # the Series directly is label-based and fails on a non-default index.
+    device_macs = device_df['device_mac_address'].tolist()
+    interface_names = ['eth0', 'eth1', 'eth2', 'eth3']
+
+    interface_stats_data = []
+    for _ in range(n):
+        device_mac = random.choice(device_macs)
+        ifname = random.choice(interface_names)
+        # Random UTC timestamp within the past 5 days
+        sample_time = datetime.now(timezone.utc) - timedelta(minutes=random.randint(0, 1440 * 5))
+
+        interface_stats_data.append({
+            'device_mac_address': device_mac,
+            'ifname': ifname,
+            'time': sample_time,
+            'in_discards': random.randint(0, 1000),
+            'in_errors': random.randint(0, 500),
+            'out_discards': random.randint(0, 800),
+            'out_errors': random.randint(0, 400),
+            'in_octets': random.randint(1000, 100000),
+            'out_octets': random.randint(1000, 100000),
+        })
+    df = pd.DataFrame(interface_stats_data)
+    df.to_sql('interfacestats', conn, index=False)
+
+# Generate synthetic data for the ts_flow table
+def generate_flow_data(conn, device_df, n=1000):
+    """Create the ``ts_flow`` table with ``n`` synthetic flow records.
+
+    Args:
+        conn: Database connection handed to ``DataFrame.to_sql``.
+        device_df: DataFrame with a ``switchip`` column; each flow's
+            ``sampler_address`` is picked at random from it.
+        n: Number of flow rows to generate.
+
+    Returns:
+        None. The rows are written to the ``ts_flow`` table.
+    """
+    # Work on a plain list: random.choice indexes by position, whereas indexing
+    # the Series directly is label-based and fails on a non-default index.
+    sampler_addresses = device_df['switchip'].tolist()
+
+    flow_data = []
+    for _ in range(n):
+        sampler_address = random.choice(sampler_addresses)
+        proto = random.choice(['TCP', 'UDP'])
+        src_addr = random_ip()
+        dst_addr = random_ip()
+        src_port = random.randint(1024, 65535)
+        dst_port = random.randint(1024, 65535)
+        in_if = random.randint(1, 10)
+        out_if = random.randint(1, 10)
+
+        # Two random day offsets in the past 1-30 days; the larger offset is the
+        # earlier moment, so it becomes the start and the flow never ends before it begins.
+        later_offset, earlier_offset = sorted((random.randint(1, 30), random.randint(1, 30)))
+        now = datetime.now()
+        flow_start = int((now - timedelta(days=earlier_offset)).timestamp())
+        flow_end = int((now - timedelta(days=later_offset)).timestamp())
+
+        bytes_transferred = random.randint(1000, 100000)
+        packets = random.randint(1, 1000)
+        # Random UTC timestamp within the past 5 days
+        flow_time = datetime.now(timezone.utc) - timedelta(minutes=random.randint(0, 1440 * 5))
+
+        flow_data.append({
+            'sampler_address': sampler_address,
+            'proto': proto,
+            'src_addr': src_addr,
+            'dst_addr': dst_addr,
+            'src_port': src_port,
+            'dst_port': dst_port,
+            'in_if': in_if,
+            'out_if': out_if,
+            'flow_start': flow_start,
+            'flow_end': flow_end,
+            'bytes': bytes_transferred,
+            'packets': packets,
+            'time': flow_time,
+        })
+    df = pd.DataFrame(flow_data)
+    df.to_sql('ts_flow', conn, index=False)
+
+def load_params(req):
+    """Translate a correlation request into SQL bind parameters and filter clauses.
+
+    Args:
+        req: Request object exposing ``from_time``, ``ifname``, ``region`` and
+            the ``min_``/``max_`` variants of ``in_errors``, ``out_errors``,
+            ``in_discards`` and ``out_discards``.
+
+    Returns:
+        ``(params, filters)`` where ``params`` is a dict of bind values (always
+        containing ``"from_time"``) and ``filters`` is a list of SQL conditions
+        on the ``i`` / ``d`` table aliases that reference those parameters.
+        When ``from_time`` is supplied but cannot be interpreted, a dict
+        ``{"error": <message>}`` is returned instead of the tuple.
+    """
+    # NOTE(review): from_time is built from the naive local clock while the
+    # generated rows carry UTC timestamps — confirm the comparison is intended.
+    invalid_from_time = {"error": "Invalid from_time format. Please provide a valid time description such as 'past 7 days' or 'since last month'."}
+
+    # Step 1: Convert the from_time natural language string to a timestamp if provided
+    if req.from_time:
+        # Use `dateparser` to parse natural language timeframes
+        logger.info(f"{'* ' * 50}\n\nCaptured from time: {req.from_time}\n\n")
+        parsed_time = parse(req.from_time, settings={'RELATIVE_BASE': datetime.now()})
+        if not parsed_time:
+            # Retry with the phrase rewritten as "<n> <unit> ago"
+            conv_time = convert_to_ago_format(req.from_time)
+            if conv_time:
+                parsed_time = parse(conv_time, settings={'RELATIVE_BASE': datetime.now()})
+        if not parsed_time:
+            # Neither the raw phrase nor the rewritten one could be parsed; do not
+            # let None through as the from_time bind value.
+            return invalid_from_time
+        logger.info(f"\n\nConverted from time: {parsed_time}\n\n{'* ' * 50}\n\n")
+        from_time = parsed_time
+        logger.info(f"Using parsed from_time: {from_time}")
+    else:
+        # If no from_time is provided, default to the past 7 days
+        from_time = datetime.now() - timedelta(days=7)
+        logger.info(f"Using default from_time: {from_time}")
+
+    # Step 2: Build the dynamic filter list based on the optional fields
+    filters = []
+    params = {"from_time": from_time}
+
+    if req.ifname:
+        filters.append("i.ifname = :ifname")
+        params["ifname"] = req.ifname
+
+    if req.region:
+        filters.append("d.region = :region")
+        params["region"] = req.region
+
+    # Range filters on the interface counters; 0 is a valid bound, so test
+    # against None rather than truthiness.
+    for column in ("in_errors", "out_errors", "in_discards", "out_discards"):
+        for bound, operator in (("min", ">="), ("max", "<=")):
+            key = f"{bound}_{column}"
+            value = getattr(req, key)
+            if value is not None:
+                filters.append(f"i.{column} {operator} :{key}")
+                params[key] = value
+
+    return params, filters
--- a/model_server/requirements.txt
+++ b/model_server/requirements.txt
@ -4,3 +4,5 @@ sentence-transformers
 torch
 uvicorn
 gliner
+pandas
+dateparser