Python FP and docs updates (#58)

* refactor: Update comments for clarity and add expectations.json files for performance metrics

* feat: Implement FP guard for JS/TS local-collection receivers to suppress missing ownership checks

* feat: Enhance Rust parameter handling to classify local collections and prevent false ownership checks

* refactor: Simplify code formatting for better readability in multiple files

* refactor: Improve UTF-8 sequence length handling and enhance clarity in loop iteration

* feat: Update Java and Python patterns to include new security rules

* refactor: Improve comment clarity and consistency across multiple Rust files

* refactor: Simplify code formatting for improved readability in integration tests and module files

* refactor: Improve comment formatting and enhance clarity in assertions across multiple files
This commit is contained in:
Eli Peter 2026-04-29 19:53:34 -04:00 committed by GitHub
parent 4db0805de6
commit a438886217
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
291 changed files with 9485 additions and 3851 deletions

View file

@ -0,0 +1,47 @@
# Nyx CVE benchmark fixture.
#
# CVE: CVE-2025-69662
# Project: geopandas (geopandas/geopandas)
# License: BSD-3-Clause (https://github.com/geopandas/geopandas/blob/main/LICENSE.txt)
# Advisory: https://github.com/advisories/GHSA-6497-prx7-gpmq
# Patched: 6aa8ef14ffdee4ba1044349ab948e1a1fbfaf419 geopandas/io/sql.py:432-438
#
# Fix: replace the f-string-built Find_SRID probe with a
# bound-parameter SQLAlchemy text() statement; SQLAlchemy passes the
# values via the driver's parameter binding, so attacker-supplied
# identifiers can no longer break out of the literal context.
#
# Trims:
# - Same scaffolding trim as vulnerable.py — `.fetchone()[0]` (post-
# sink result extraction) removed.
# - Patched-fix simplification: the upstream fix nests
# `text(...).bindparams(...)` directly inside `connection.execute(...)`.
# The fixture lifts the bound-parameter clause into a local `stmt`
# so the `.bindparams` call is a top-level CFG node — without this
# reshape, cfg-unguarded-sink fires on the surrounding execute
# because the inlined sanitizer-in-arg shape is not yet recognised
# by the dominator-based guard check. The verbatim bytes of the
# `text(...).bindparams(...)` clause are preserved.
from flask import Flask, request
from sqlalchemy import create_engine, text
# Flask application object; the route decorator below registers against it.
app = Flask(__name__)
# SQLAlchemy engine pointed at a local Postgres database named `geo`
# (fixture scaffolding standing in for the real to_postgis() target).
engine = create_engine("postgresql://localhost/geo")
# Patched handler for POST /upload-layer.
#
# Reads `geom_name`, `table` and `schema` from the JSON request body and
# runs the Find_SRID probe with those values supplied as bound parameters,
# so they are never spliced into the SQL text. Returns {"ok": True}.
@app.post("/upload-layer")
def upload_layer():
# force=True parses the body as JSON even without a JSON Content-Type;
# `or {}` substitutes an empty dict when the parsed body is falsy.
body = request.get_json(force=True) or {}
# Caller-controlled values (the fixture's taint sources), each with a
# literal default used when the key is absent.
geom_name = body.get("geom_name", "geom")
name = body.get("table", "data")
schema_name = body.get("schema", "public")
with engine.begin() as connection:
# Verbatim bytes from sql.py:433-437 — bound-parameter probe.
# Kept as a separate `stmt` assignment (not inlined into execute) so
# the .bindparams call is its own statement ahead of the sink.
stmt = text(
"SELECT Find_SRID(:schema_name, :name, :geom_name);"
).bindparams(
schema_name=schema_name, name=name, geom_name=geom_name
)
# Sink. The result is deliberately discarded: upstream's trailing
# `.fetchone()[0]` was trimmed from this fixture.
connection.execute(stmt)
return {"ok": True}

View file

@ -0,0 +1,46 @@
# Nyx CVE benchmark fixture.
#
# CVE: CVE-2025-69662
# Project: geopandas (geopandas/geopandas)
# License: BSD-3-Clause (https://github.com/geopandas/geopandas/blob/main/LICENSE.txt)
# Advisory: https://github.com/advisories/GHSA-6497-prx7-gpmq
# Vulnerable: c301579e0ac4034c19bece63c08bf628613700b4 geopandas/io/sql.py:432-435
#
# geopandas.GeoDataFrame.to_postgis() concatenated the GeoDataFrame's
# geometry column name (and the schema/table names) into a Find_SRID
# probe via f-string. A user uploading a GeoDataFrame whose geometry
# column was named with embedded SQL (e.g. "geom'); DROP TABLE...--")
# achieved arbitrary SQL execution against the target Postgres database.
#
# Trims:
# - Surrounding to_postgis() body (CRS lookup, EWKB conversion, dtype
# dict construction at L399-422) that scaffolds the vulnerable
# Find_SRID probe.
# - Trailing `.fetchone()[0]` on the connection.execute(...) result —
# downstream of the sink (result extraction), not on the flow path.
#
# Only the source statement (geom_name from request input), the
# f-string SQL builder, and the connection.execute(text(...)) sink are
# preserved verbatim from sql.py:432-435.
from flask import Flask, request
from sqlalchemy import create_engine, text
# Flask application object; the route decorator below registers against it.
app = Flask(__name__)
# SQLAlchemy engine pointed at a local Postgres database named `geo`
# (fixture scaffolding standing in for the real to_postgis() target).
engine = create_engine("postgresql://localhost/geo")
# Vulnerable handler for POST /upload-layer.
#
# INTENTIONALLY VULNERABLE benchmark fixture: the request-supplied
# identifiers are interpolated into the SQL text with an f-string and then
# executed. Do not parameterize or sanitize here — the unsanitized
# source -> sink flow is what this fixture exists to exhibit.
# Returns {"ok": True}.
@app.post("/upload-layer")
def upload_layer():
# force=True parses the body as JSON even without a JSON Content-Type;
# `or {}` substitutes an empty dict when the parsed body is falsy.
body = request.get_json(force=True) or {}
# geom_name is supplied by the API caller — no validation upstream.
geom_name = body.get("geom_name", "geom")
# `table` and `schema` are read from the same request body and are
# likewise interpolated below without validation.
name = body.get("table", "data")
schema_name = body.get("schema", "public")
with engine.begin() as connection:
# Verbatim from sql.py:432-435 — Find_SRID probe with
# f-string-interpolated identifiers.
# Sink. The result is deliberately discarded: upstream's trailing
# `.fetchone()[0]` was trimmed from this fixture.
connection.execute(
text(f"SELECT Find_SRID('{schema_name}', '{name}', '{geom_name}');")
)
return {"ok": True}

View file

@ -0,0 +1,79 @@
# Nyx CVE benchmark fixture.
#
# CVE: CVE-2026-33626
# Project: LMDeploy (InternLM/lmdeploy)
# License: Apache-2.0 (https://github.com/InternLM/lmdeploy/blob/main/LICENSE)
# Advisory: https://github.com/advisories/GHSA-25c5-rg58-mhxh
# Patched: 71d64a339edb901e9005358e0633fbbab367d626 lmdeploy/vl/media/connection.py:24-69
#
# Fix: introduce `_is_safe_url(url)` which resolves the hostname via
# `socket.getaddrinfo`, walks every returned IP, and rejects any that
# aren't `is_global` (covers loopback, RFC1918 private, link-local,
# multicast, reserved, unspecified). The vulnerable scheme-only check
# is replaced by this allowlist gate before the fetch.
#
# Trims: same scaffolding trim as vulnerable.py — MediaIO generic
# plumbing replaced with a Flask handler; fetch_timeout env-var
# resolution collapsed to a literal. The `_is_safe_url` body, the
# replacement gate at L55-58, and the `client.get(...,
# allow_redirects=True)` fetch are preserved verbatim from the fix
# commit.
import ipaddress
import socket
from urllib.parse import urlparse
import requests
from flask import Flask, request
# Flask application object; the route decorator below registers against it.
app = Flask(__name__)
# Headers sent with the outbound fetch in load_image; only a User-Agent
# is set.
headers = {"User-Agent": "Mozilla/5.0"}
# Decide whether `url` may be fetched.
#
# Returns (True, 'URL is safe') only when the scheme is http or https, a
# hostname can be parsed, and every address getaddrinfo returns for that
# hostname is globally routable. Otherwise returns (False, <reason>).
# Any exception derived from Exception is converted into a (False, reason)
# result instead of propagating, so the check fails closed.
#
# NOTE(review): the DNS lookup here is separate from whatever lookup the
# HTTP client performs at fetch time — confirm whether a changing DNS answer
# between the two matters for this fixture.
def _is_safe_url(url: str) -> tuple[bool, str]:
"""Check if the URL is safe to fetch (not internal/private)."""
try:
parsed = urlparse(url)
# Scheme gate: anything other than http/https is rejected outright.
if parsed.scheme not in ('http', 'https'):
return False, f'Unsupported scheme: {parsed.scheme}'
hostname = parsed.hostname
# Covers URLs with no host component (hostname is None or empty).
if not hostname:
return False, 'Could not parse hostname from URL'
# check all IPs (IPv4 + IPv6) using getaddrinfo
try:
infos = socket.getaddrinfo(hostname, None)
except socket.gaierror:
return False, 'Hostname resolution failed'
for info in infos:
# info[4] is the sockaddr tuple; its first element is the address
# string. A string ip_address() cannot parse raises ValueError,
# which the outer handler turns into a rejection.
ip = ipaddress.ip_address(info[4][0])
# block any IP that is not globally routable
if not ip.is_global:
return False, f'Blocked non-global IP detected: {ip}'
# Reached only when no resolved address was rejected.
return True, 'URL is safe'
except Exception as e:
return False, f'URL validation failed: {str(e)}'
# Patched handler for POST /load-image.
#
# Reads `url` from the JSON request body, rejects it with ValueError unless
# _is_safe_url approves it, then fetches it and returns the response body
# length as {"size": <int>}.
@app.post("/load-image")
def load_image():
# force=True parses the body as JSON even without a JSON Content-Type;
# `or {}` substitutes an empty dict when the parsed body is falsy.
body = request.get_json(force=True) or {}
# Caller-controlled URL (the fixture's taint source); defaults to "".
url = body.get("url", "")
url_spec = urlparse(url)
# Verbatim from connection.py:55-58 — replaces the scheme-only
# check with a private-IP-blocking allowlist.
is_safe, reason = _is_safe_url(url_spec.geturl())
if not is_safe:
raise ValueError(f'URL is blocked for security reasons: {reason}')
# Literal stand-in for upstream's env-var timeout resolution.
fetch_timeout = 10
client = requests.Session()
# Cap on how many redirects the session will follow.
client.max_redirects = 3
# NOTE(review): only the initial URL was passed through _is_safe_url;
# with allow_redirects=True the redirect hops are followed without being
# re-checked in this block — confirm that is intended for the fixture.
response = client.get(
url_spec.geturl(), headers=headers, timeout=fetch_timeout, allow_redirects=True
)
# Raises on an HTTP error status instead of returning its body size.
response.raise_for_status()
return {"size": len(response.content)}

View file

@ -0,0 +1,51 @@
# Nyx CVE benchmark fixture.
#
# CVE: CVE-2026-33626
# Project: LMDeploy (InternLM/lmdeploy)
# License: Apache-2.0 (https://github.com/InternLM/lmdeploy/blob/main/LICENSE)
# Advisory: https://github.com/advisories/GHSA-25c5-rg58-mhxh
# Vulnerable: 819a80836e991ca3f427b0e85faca159083d3d40 lmdeploy/vl/media/connection.py:23-37
#
# LMDeploy's vision-language image loader accepted user-supplied
# image URLs from the chat-completion request and fetched them via
# `requests.Session().get(url)` after only a scheme check. Attackers
# embedded URLs pointing at internal network services or cloud
# metadata endpoints (e.g. http://169.254.169.254/...) and exfiltrated
# the response back through the model output.
#
# Trims:
# - Surrounding _load_data_url / file-URL branches that don't reach
# the HTTP sink (lines 41+).
# - The scheme-only allowlist check at L24-25 of upstream. The
# CVE is host-based SSRF (private IP / cloud-metadata host); the
# scheme check was the insufficient validation the fix replaces.
# Removing it keeps the load-bearing source → sink flow intact.
# - The fetch_timeout env-var resolution (L28-31) — collapsed to a
# literal so the fixture is self-contained.
# - MediaIO[_M] generic plumbing — replaced with a Flask handler so
# the source is a concrete request flow.
#
# The verbatim load-bearing lines are the `client = requests.Session()`
# constructor and the `client.get(url_spec.geturl(), headers=headers,
# timeout=fetch_timeout)` fetch site at lines 33-34 of upstream.
from urllib.parse import urlparse
import requests
from flask import Flask, request
# Flask application object; the route decorator below registers against it.
app = Flask(__name__)
# Headers sent with the outbound fetch in load_image; only a User-Agent
# is set.
headers = {"User-Agent": "Mozilla/5.0"}
# Vulnerable handler for POST /load-image.
#
# INTENTIONALLY VULNERABLE benchmark fixture: the request-supplied URL is
# fetched with no host or scheme validation. Do not add a guard here — the
# unvalidated source -> sink flow is what this fixture exists to exhibit.
# Returns the response body length as {"size": <int>}.
@app.post("/load-image")
def load_image():
# force=True parses the body as JSON even without a JSON Content-Type;
# `or {}` substitutes an empty dict when the parsed body is falsy.
body = request.get_json(force=True) or {}
# Caller-controlled URL (the fixture's taint source); defaults to "".
url = body.get("url", "")
url_spec = urlparse(url)
# Literal stand-in for upstream's env-var timeout resolution.
fetch_timeout = 10
# Verbatim from connection.py:33-34 — Session().get(url).
client = requests.Session()
response = client.get(url_spec.geturl(), headers=headers, timeout=fetch_timeout)
# Raises on an HTTP error status instead of returning its body size.
response.raise_for_status()
return {"size": len(response.content)}