[Kan-103] add support toxic/jailbreak model (#49)

* add toxic/jailbreak model

* fix path loading model

* fix syntax

* fix bug, lint, format

* fix bug

* formatting

* add parallel + chunking

* fix bug

* working version

* fix onnx name error

* device

* fix jailbreak config

* fix syntax error

* format

* add requirement + cli download for dockerfile

* add task

* add skeleton change for envoy filter for prompt guard

* fix hardware config

* fix bug

* add config changes

* add gitignore

* merge main

* integrate arch-guard with filter

* add hardware config

* nothing

* add hardware config feature

* fix requirement

* fix chat ui

* fix onnx

* fix lint

* remove non intel cpu

* remove onnx

* working version

* modify docker

* fix guard time

* add nvidia support

* remove nvidia

* add gpu

* add gpu

* add gpu support

* add gpu support for compose

* add gpu support for compose

* add gpu support for compose

* add gpu support for compose

* add gpu support for compose

* fix docker file

* fix int test

* correct gpu docker

* upgrade python 10

* fix logits to be gpu compatible

* default to cpu dockerfile

* resolve comments

* fix lint + unused parameters

* fix

* remove eetq install for cpu

* remove deploy gpu

---------

Co-authored-by: Adil Hafeez <adil@katanemo.com>
This commit is contained in:
Co Tran 2024-09-23 12:07:31 -07:00 committed by GitHub
parent 80c554ce1a
commit 79b1c5415f
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
18 changed files with 1622 additions and 191 deletions

View file

@ -5,36 +5,39 @@ import re
import logging
from dateparser import parse
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logging.basicConfig(
level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)
# Function to convert natural language time expressions to "X {time} ago" format
def convert_to_ago_format(expression):
# Define patterns for different time units
time_units = {
r'seconds': 'seconds',
r'minutes': 'minutes',
r'mins': 'mins',
r'hrs': 'hrs',
r'hours': 'hours',
r'hour': 'hour',
r'hr': 'hour',
r'days': 'days',
r'day': 'day',
r'weeks': 'weeks',
r'week': 'week',
r'months': 'months',
r'month': 'month',
r'years': 'years',
r'yrs': 'years',
r'year': 'year',
r'yr': 'year',
r"seconds": "seconds",
r"minutes": "minutes",
r"mins": "mins",
r"hrs": "hrs",
r"hours": "hours",
r"hour": "hour",
r"hr": "hour",
r"days": "days",
r"day": "day",
r"weeks": "weeks",
r"week": "week",
r"months": "months",
r"month": "month",
r"years": "years",
r"yrs": "years",
r"year": "year",
r"yr": "year",
}
# Iterate over each time unit and create regex for each phrase format
for pattern, unit in time_units.items():
# Handle "for the past X {unit}"
match = re.search(fr'(\d+) {pattern}', expression)
match = re.search(rf"(\d+) {pattern}", expression)
if match:
quantity = match.group(1)
return f"{quantity} {unit} ago"
@ -45,35 +48,48 @@ def convert_to_ago_format(expression):
# Function to generate random MAC addresses
def random_mac():
    """Return a random, well-formed 48-bit MAC address string.

    The result is six colon-separated, two-digit uppercase hex octets: the
    fixed prefix ``AA:BB:CC:DD`` followed by two random octets (0-255 each).

    The fixed prefix is deliberately four octets long: a five-octet prefix
    plus two random octets would yield seven octets, which is not a valid
    MAC address.
    """
    random_octets = ":".join(f"{random.randint(0, 255):02X}" for _ in range(2))
    return f"AA:BB:CC:DD:{random_octets}"
# Function to generate random IP addresses
def random_ip():
    """Return a random dotted-quad address string.

    Each of the four octets is drawn independently with
    ``random.randint(1, 255)``, so no octet is ever 0.
    """
    octets = [str(random.randint(1, 255)) for _ in range(4)]
    return ".".join(octets)
# Generate synthetic data for the device table
def generate_device_data(
    conn,
    n=1000,
):
    """Create ``n`` synthetic device rows, write them to SQL, and return them.

    Args:
        conn: Database connection handed straight to ``DataFrame.to_sql``.
        n: Number of device rows to generate.

    Returns:
        The ``pandas.DataFrame`` that was written to the ``device`` table.
    """
    rows = range(n)
    region_choices = ["US", "EU", "ASIA"]

    def _random_uptime():
        # Draw order (days, hours, minutes, seconds) matches the rendered text.
        days = random.randint(0, 10)
        hours = random.randint(0, 23)
        minutes = random.randint(0, 59)
        seconds = random.randint(0, 59)
        return f"{days} days {hours}:{minutes}:{seconds}"

    # Columns are filled one at a time, in this order, so the sequence of
    # random draws (and therefore seeded output) stays column-major.
    columns = {}
    columns["switchip"] = [random_ip() for _ in rows]
    columns["hwsku"] = [f"HW{i+1}" for i in rows]
    columns["hostname"] = [f"switch{i+1}" for i in rows]
    columns["osversion"] = [f"v{i+1}" for i in rows]
    # Even row positions are layer 2, odd ones layer 3.
    columns["layer"] = ["L3" if i % 2 else "L2" for i in rows]
    columns["region"] = [random.choice(region_choices) for _ in rows]
    columns["uptime"] = [_random_uptime() for _ in rows]
    columns["device_mac_address"] = [random_mac() for _ in rows]

    frame = pd.DataFrame(columns)
    frame.to_sql("device", conn, index=False)
    return frame
# Generate synthetic data for the interfacestats table
def generate_interface_stats_data(conn, device_df, n=1000):
interface_stats_data = []
for _ in range(n):
device_mac = random.choice(device_df['device_mac_address'])
ifname = random.choice(['eth0', 'eth1', 'eth2', 'eth3'])
time = datetime.now(timezone.utc) - timedelta(minutes=random.randint(0, 1440 * 5)) # random timestamps in the past 5 day
device_mac = random.choice(device_df["device_mac_address"])
ifname = random.choice(["eth0", "eth1", "eth2", "eth3"])
time = datetime.now(timezone.utc) - timedelta(
minutes=random.randint(0, 1440 * 5)
) # random timestamps in the past 5 day
in_discards = random.randint(0, 1000)
in_errors = random.randint(0, 500)
out_discards = random.randint(0, 800)
@ -81,70 +97,86 @@ def generate_interface_stats_data(conn, device_df, n=1000):
in_octets = random.randint(1000, 100000)
out_octets = random.randint(1000, 100000)
interface_stats_data.append({
'device_mac_address': device_mac,
'ifname': ifname,
'time': time,
'in_discards': in_discards,
'in_errors': in_errors,
'out_discards': out_discards,
'out_errors': out_errors,
'in_octets': in_octets,
'out_octets': out_octets
})
interface_stats_data.append(
{
"device_mac_address": device_mac,
"ifname": ifname,
"time": time,
"in_discards": in_discards,
"in_errors": in_errors,
"out_discards": out_discards,
"out_errors": out_errors,
"in_octets": in_octets,
"out_octets": out_octets,
}
)
df = pd.DataFrame(interface_stats_data)
df.to_sql('interfacestats', conn, index=False)
df.to_sql("interfacestats", conn, index=False)
return
# Generate synthetic data for the ts_flow table
def generate_flow_data(conn, device_df, n=1000):
    """Create ``n`` synthetic flow records and write them to ``ts_flow``.

    Args:
        conn: Database connection handed straight to ``DataFrame.to_sql``.
        device_df: DataFrame with a ``switchip`` column; each flow's
            ``sampler_address`` is picked at random from it.
        n: Number of flow rows to generate.

    Returns:
        None. The rows are only persisted to the ``ts_flow`` table.
    """
    flow_data = []
    for _ in range(n):
        sampler_address = random.choice(device_df["switchip"])
        proto = random.choice(["TCP", "UDP"])
        src_addr = random_ip()
        dst_addr = random_ip()
        src_port = random.randint(1024, 65535)
        dst_port = random.randint(1024, 65535)
        in_if = random.randint(1, 10)
        out_if = random.randint(1, 10)

        # Two independent "days ago" draws (1-30). The larger offset is the
        # earlier instant, so it becomes the start; this guarantees
        # flow_end >= flow_start instead of roughly half the flows ending
        # before they begin.
        now = datetime.now()
        start_days_ago, end_days_ago = sorted(
            (random.randint(1, 30), random.randint(1, 30)), reverse=True
        )
        flow_start = int((now - timedelta(days=start_days_ago)).timestamp())
        flow_end = int((now - timedelta(days=end_days_ago)).timestamp())

        bytes_transferred = random.randint(1000, 100000)
        packets = random.randint(1, 1000)
        # Record time: a random minute within the past 5 days, in UTC.
        flow_time = datetime.now(timezone.utc) - timedelta(
            minutes=random.randint(0, 1440 * 5)
        )

        flow_data.append(
            {
                "sampler_address": sampler_address,
                "proto": proto,
                "src_addr": src_addr,
                "dst_addr": dst_addr,
                "src_port": src_port,
                "dst_port": dst_port,
                "in_if": in_if,
                "out_if": out_if,
                "flow_start": flow_start,
                "flow_end": flow_end,
                "bytes": bytes_transferred,
                "packets": packets,
                "time": flow_time,
            }
        )

    df = pd.DataFrame(flow_data)
    df.to_sql("ts_flow", conn, index=False)
def load_params(req):
# Step 1: Convert the from_time natural language string to a timestamp if provided
if req.from_time:
# Use `dateparser` to parse natural language timeframes
logger.info(f"{'* ' * 50}\n\nCaptured from time: {req.from_time}\n\n")
parsed_time = parse(req.from_time, settings={'RELATIVE_BASE': datetime.now()})
parsed_time = parse(req.from_time, settings={"RELATIVE_BASE": datetime.now()})
if not parsed_time:
conv_time = convert_to_ago_format(req.from_time)
if conv_time:
parsed_time = parse(conv_time, settings={'RELATIVE_BASE': datetime.now()})
parsed_time = parse(
conv_time, settings={"RELATIVE_BASE": datetime.now()}
)
else:
return {"error": "Invalid from_time format. Please provide a valid time description such as 'past 7 days' or 'since last month'."}
return {
"error": "Invalid from_time format. Please provide a valid time description such as 'past 7 days' or 'since last month'."
}
logger.info(f"\n\nConverted from time: {parsed_time}\n\n{'* ' * 50}\n\n")
from_time = parsed_time
logger.info(f"Using parsed from_time: {from_time}")