plano/model_server/app/network_data_generator.py

import pandas as pd
import random
from datetime import datetime, timedelta, timezone
import re
import logging
from dateparser import parse

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Function to convert natural language time expressions to "X {time} ago" format
def convert_to_ago_format(expression):
    # Define patterns for different time units
    time_units = {
        r'seconds': 'seconds',
        r'minutes': 'minutes',
        r'mins': 'mins',
        r'hrs': 'hrs',
        r'hours': 'hours',
        r'hour': 'hour',
        r'hr': 'hour',
        r'days': 'days',
        r'day': 'day',
        r'weeks': 'weeks',
        r'week': 'week',
        r'months': 'months',
        r'month': 'month',
        r'years': 'years',
        r'yrs': 'years',
        r'year': 'year',
        r'yr': 'year',
    }

    # Iterate over each time unit and create regex for each phrase format
    for pattern, unit in time_units.items():
        # Handle "for the past X {unit}"
        match = re.search(fr'(\d+) {pattern}', expression)
        if match:
            quantity = match.group(1)
            return f"{quantity} {unit} ago"

    # If the format is not recognized, return None or raise an error
    return None


# Function to generate random MAC addresses
def random_mac():
    return "AA:BB:CC:DD:EE:" + ':'.join([f"{random.randint(0, 255):02X}" for _ in range(2)])

# Function to generate random IP addresses
def random_ip():
    return f"{random.randint(1, 255)}.{random.randint(1, 255)}.{random.randint(1, 255)}.{random.randint(1, 255)}"

# Generate synthetic data for the device table
def generate_device_data(conn, n=1000,):
    device_data = {
        'switchip': [random_ip() for _ in range(n)],
        'hwsku': [f'HW{i+1}' for i in range(n)],
        'hostname': [f'switch{i+1}' for i in range(n)],
        'osversion': [f'v{i+1}' for i in range(n)],
        'layer': ['L2' if i % 2 == 0 else 'L3' for i in range(n)],
        'region': [random.choice(['US', 'EU', 'ASIA']) for _ in range(n)],
        'uptime': [f'{random.randint(0, 10)} days {random.randint(0, 23)}:{random.randint(0, 59)}:{random.randint(0, 59)}' for _ in range(n)],
        'device_mac_address': [random_mac() for _ in range(n)]
    }
    df = pd.DataFrame(device_data)
    df.to_sql('device', conn, index=False)
    return df

# Generate synthetic data for the interfacestats table
def generate_interface_stats_data(conn, device_df, n=1000):
    interface_stats_data = []
    for _ in range(n):
        device_mac = random.choice(device_df['device_mac_address'])
        ifname = random.choice(['eth0', 'eth1', 'eth2', 'eth3'])
        time = datetime.now(timezone.utc) - timedelta(minutes=random.randint(0, 1440 * 5))  # random timestamps in the past 5 day
        in_discards = random.randint(0, 1000)
        in_errors = random.randint(0, 500)
        out_discards = random.randint(0, 800)
        out_errors = random.randint(0, 400)
        in_octets = random.randint(1000, 100000)
        out_octets = random.randint(1000, 100000)

        interface_stats_data.append({
            'device_mac_address': device_mac,
            'ifname': ifname,
            'time': time,
            'in_discards': in_discards,
            'in_errors': in_errors,
            'out_discards': out_discards,
            'out_errors': out_errors,
            'in_octets': in_octets,
            'out_octets': out_octets
        })
    df = pd.DataFrame(interface_stats_data)
    df.to_sql('interfacestats', conn, index=False)
    return

# Generate synthetic data for the ts_flow table
def generate_flow_data(conn, device_df, n=1000):
    flow_data = []
    for _ in range(n):
        sampler_address = random.choice(device_df['switchip'])
        proto = random.choice(['TCP', 'UDP'])
        src_addr = random_ip()
        dst_addr = random_ip()
        src_port = random.randint(1024, 65535)
        dst_port = random.randint(1024, 65535)
        in_if = random.randint(1, 10)
        out_if = random.randint(1, 10)
        flow_start = int((datetime.now() - timedelta(days=random.randint(1, 30))).timestamp())
        flow_end = int((datetime.now() - timedelta(days=random.randint(1, 30))).timestamp())
        bytes_transferred = random.randint(1000, 100000)
        packets = random.randint(1, 1000)
        flow_time = datetime.now(timezone.utc) - timedelta(minutes=random.randint(0, 1440 * 5))  # random flow time

        flow_data.append({
            'sampler_address': sampler_address,
            'proto': proto,
            'src_addr': src_addr,
            'dst_addr': dst_addr,
            'src_port': src_port,
            'dst_port': dst_port,
            'in_if': in_if,
            'out_if': out_if,
            'flow_start': flow_start,
            'flow_end': flow_end,
            'bytes': bytes_transferred,
            'packets': packets,
            'time': flow_time
        })
    df = pd.DataFrame(flow_data)
    df.to_sql('ts_flow', conn, index=False)
    return

def load_params(req):
    # Step 1: Convert the from_time natural language string to a timestamp if provided
    if req.from_time:
        # Use `dateparser` to parse natural language timeframes
        logger.info(f"{'* ' * 50}\n\nCaptured from time: {req.from_time}\n\n")
        parsed_time = parse(req.from_time, settings={'RELATIVE_BASE': datetime.now()})
        if not parsed_time:
            conv_time = convert_to_ago_format(req.from_time)
            if conv_time:
                parsed_time = parse(conv_time, settings={'RELATIVE_BASE': datetime.now()})
            else:
                return {"error": "Invalid from_time format. Please provide a valid time description such as 'past 7 days' or 'since last month'."}
        logger.info(f"\n\nConverted from time: {parsed_time}\n\n{'* ' * 50}\n\n")
        from_time = parsed_time
        logger.info(f"Using parsed from_time: {from_time}")
    else:
        # If no from_time is provided, use a default value (e.g., the past 7 days)
        from_time = datetime.now() - timedelta(days=7)
        logger.info(f"Using default from_time: {from_time}")

    # Step 2: Build the dynamic SQL query based on the optional filters
    filters = []
    params = {"from_time": from_time}

    if req.ifname:
        filters.append("i.ifname = :ifname")
        params["ifname"] = req.ifname

    if req.region:
        filters.append("d.region = :region")
        params["region"] = req.region

    if req.min_in_errors is not None:
        filters.append("i.in_errors >= :min_in_errors")
        params["min_in_errors"] = req.min_in_errors

    if req.max_in_errors is not None:
        filters.append("i.in_errors <= :max_in_errors")
        params["max_in_errors"] = req.max_in_errors

    if req.min_out_errors is not None:
        filters.append("i.out_errors >= :min_out_errors")
        params["min_out_errors"] = req.min_out_errors

    if req.max_out_errors is not None:
        filters.append("i.out_errors <= :max_out_errors")
        params["max_out_errors"] = req.max_out_errors

    if req.min_in_discards is not None:
        filters.append("i.in_discards >= :min_in_discards")
        params["min_in_discards"] = req.min_in_discards

    if req.max_in_discards is not None:
        filters.append("i.in_discards <= :max_in_discards")
        params["max_in_discards"] = req.max_in_discards

    if req.min_out_discards is not None:
        filters.append("i.out_discards >= :min_out_discards")
        params["min_out_discards"] = req.min_out_discards

    if req.max_out_discards is not None:
        filters.append("i.out_discards <= :max_out_discards")
        params["max_out_discards"] = req.max_out_discards

    return params, filters
demos for network copilot and sql analyzer (#57) * pulled from main branch after adding enums and made changes * added sql_analyzer folder and built a demo for Employee stats function calling. "top_employees" and "aggregate_stats". * sql_anayzer * After addressing PR comments * PR comments * PR comments * Addeed Network Analyzer FC Code * Added network Analyzer code for diff timeframes * Network Copilot and Employee Details demos are updated with their descriptions and resolved the PR comments * Added 2nd function in network copilot * Added 2nd function in network copilot * Added 2nd function in network copilot * Added 2nd function in network copilot * Added 2nd function in network copilot 2024-09-19 11:40:31 -07:00			`import pandas as pd`
			`import random`
			`from datetime import datetime, timedelta, timezone`
			`import re`
			`import logging`
			`from dateparser import parse`

			`logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')`
			`logger = logging.getLogger(__name__)`

			`# Function to convert natural language time expressions to "X {time} ago" format`
			`def convert_to_ago_format(expression):`
			`# Define patterns for different time units`
			`time_units = {`
			`r'seconds': 'seconds',`
			`r'minutes': 'minutes',`
			`r'mins': 'mins',`
			`r'hrs': 'hrs',`
			`r'hours': 'hours',`
			`r'hour': 'hour',`
			`r'hr': 'hour',`
			`r'days': 'days',`
			`r'day': 'day',`
			`r'weeks': 'weeks',`
			`r'week': 'week',`
			`r'months': 'months',`
			`r'month': 'month',`
			`r'years': 'years',`
			`r'yrs': 'years',`
			`r'year': 'year',`
			`r'yr': 'year',`
			`}`

			`# Iterate over each time unit and create regex for each phrase format`
			`for pattern, unit in time_units.items():`
			`# Handle "for the past X {unit}"`
			`match = re.search(fr'(\d+) {pattern}', expression)`
			`if match:`
			`quantity = match.group(1)`
			`return f"{quantity} {unit} ago"`

			`# If the format is not recognized, return None or raise an error`
			`return None`


			`# Function to generate random MAC addresses`
			`def random_mac():`
			`return "AA:BB:CC:DD:EE:" + ':'.join([f"{random.randint(0, 255):02X}" for _ in range(2)])`

			`# Function to generate random IP addresses`
			`def random_ip():`
			`return f"{random.randint(1, 255)}.{random.randint(1, 255)}.{random.randint(1, 255)}.{random.randint(1, 255)}"`

			`# Generate synthetic data for the device table`
			`def generate_device_data(conn, n=1000,):`
			`device_data = {`
			`'switchip': [random_ip() for _ in range(n)],`
			`'hwsku': [f'HW{i+1}' for i in range(n)],`
			`'hostname': [f'switch{i+1}' for i in range(n)],`
			`'osversion': [f'v{i+1}' for i in range(n)],`
			`'layer': ['L2' if i % 2 == 0 else 'L3' for i in range(n)],`
			`'region': [random.choice(['US', 'EU', 'ASIA']) for _ in range(n)],`
			`'uptime': [f'{random.randint(0, 10)} days {random.randint(0, 23)}:{random.randint(0, 59)}:{random.randint(0, 59)}' for _ in range(n)],`
			`'device_mac_address': [random_mac() for _ in range(n)]`
			`}`
			`df = pd.DataFrame(device_data)`
			`df.to_sql('device', conn, index=False)`
			`return df`

			`# Generate synthetic data for the interfacestats table`
			`def generate_interface_stats_data(conn, device_df, n=1000):`
			`interface_stats_data = []`
			`for _ in range(n):`
			`device_mac = random.choice(device_df['device_mac_address'])`
			`ifname = random.choice(['eth0', 'eth1', 'eth2', 'eth3'])`
			`time = datetime.now(timezone.utc) - timedelta(minutes=random.randint(0, 1440 * 5)) # random timestamps in the past 5 day`
			`in_discards = random.randint(0, 1000)`
			`in_errors = random.randint(0, 500)`
			`out_discards = random.randint(0, 800)`
			`out_errors = random.randint(0, 400)`
			`in_octets = random.randint(1000, 100000)`
			`out_octets = random.randint(1000, 100000)`

			`interface_stats_data.append({`
			`'device_mac_address': device_mac,`
			`'ifname': ifname,`
			`'time': time,`
			`'in_discards': in_discards,`
			`'in_errors': in_errors,`
			`'out_discards': out_discards,`
			`'out_errors': out_errors,`
			`'in_octets': in_octets,`
			`'out_octets': out_octets`
			`})`
			`df = pd.DataFrame(interface_stats_data)`
			`df.to_sql('interfacestats', conn, index=False)`
			`return`

			`# Generate synthetic data for the ts_flow table`
			`def generate_flow_data(conn, device_df, n=1000):`
			`flow_data = []`
			`for _ in range(n):`
			`sampler_address = random.choice(device_df['switchip'])`
			`proto = random.choice(['TCP', 'UDP'])`
			`src_addr = random_ip()`
			`dst_addr = random_ip()`
			`src_port = random.randint(1024, 65535)`
			`dst_port = random.randint(1024, 65535)`
			`in_if = random.randint(1, 10)`
			`out_if = random.randint(1, 10)`
			`flow_start = int((datetime.now() - timedelta(days=random.randint(1, 30))).timestamp())`
			`flow_end = int((datetime.now() - timedelta(days=random.randint(1, 30))).timestamp())`
			`bytes_transferred = random.randint(1000, 100000)`
			`packets = random.randint(1, 1000)`
			`flow_time = datetime.now(timezone.utc) - timedelta(minutes=random.randint(0, 1440 * 5)) # random flow time`

			`flow_data.append({`
			`'sampler_address': sampler_address,`
			`'proto': proto,`
			`'src_addr': src_addr,`
			`'dst_addr': dst_addr,`
			`'src_port': src_port,`
			`'dst_port': dst_port,`
			`'in_if': in_if,`
			`'out_if': out_if,`
			`'flow_start': flow_start,`
			`'flow_end': flow_end,`
			`'bytes': bytes_transferred,`
			`'packets': packets,`
			`'time': flow_time`
			`})`
			`df = pd.DataFrame(flow_data)`
			`df.to_sql('ts_flow', conn, index=False)`
			`return`

			`def load_params(req):`
			`# Step 1: Convert the from_time natural language string to a timestamp if provided`
			`if req.from_time:`
			# Use `dateparser` to parse natural language timeframes
			`logger.info(f"{'* ' * 50}\n\nCaptured from time: {req.from_time}\n\n")`
			`parsed_time = parse(req.from_time, settings={'RELATIVE_BASE': datetime.now()})`
			`if not parsed_time:`
			`conv_time = convert_to_ago_format(req.from_time)`
			`if conv_time:`
			`parsed_time = parse(conv_time, settings={'RELATIVE_BASE': datetime.now()})`
			`else:`
			`return {"error": "Invalid from_time format. Please provide a valid time description such as 'past 7 days' or 'since last month'."}`
			`logger.info(f"\n\nConverted from time: {parsed_time}\n\n{'* ' * 50}\n\n")`
			`from_time = parsed_time`
			`logger.info(f"Using parsed from_time: {from_time}")`
			`else:`
			`# If no from_time is provided, use a default value (e.g., the past 7 days)`
			`from_time = datetime.now() - timedelta(days=7)`
			`logger.info(f"Using default from_time: {from_time}")`

			`# Step 2: Build the dynamic SQL query based on the optional filters`
			`filters = []`
			`params = {"from_time": from_time}`

			`if req.ifname:`
			`filters.append("i.ifname = :ifname")`
			`params["ifname"] = req.ifname`

			`if req.region:`
			`filters.append("d.region = :region")`
			`params["region"] = req.region`

			`if req.min_in_errors is not None:`
			`filters.append("i.in_errors >= :min_in_errors")`
			`params["min_in_errors"] = req.min_in_errors`

			`if req.max_in_errors is not None:`
			`filters.append("i.in_errors <= :max_in_errors")`
			`params["max_in_errors"] = req.max_in_errors`

			`if req.min_out_errors is not None:`
			`filters.append("i.out_errors >= :min_out_errors")`
			`params["min_out_errors"] = req.min_out_errors`

			`if req.max_out_errors is not None:`
			`filters.append("i.out_errors <= :max_out_errors")`
			`params["max_out_errors"] = req.max_out_errors`

			`if req.min_in_discards is not None:`
			`filters.append("i.in_discards >= :min_in_discards")`
			`params["min_in_discards"] = req.min_in_discards`

			`if req.max_in_discards is not None:`
			`filters.append("i.in_discards <= :max_in_discards")`
			`params["max_in_discards"] = req.max_in_discards`

			`if req.min_out_discards is not None:`
			`filters.append("i.out_discards >= :min_out_discards")`
			`params["min_out_discards"] = req.min_out_discards`

			`if req.max_out_discards is not None:`
			`filters.append("i.out_discards <= :max_out_discards")`
			`params["max_out_discards"] = req.max_out_discards`

			`return params, filters`