Performance and precision pass (#64)

2026-06-15 20:05:13 +02:00 · 2026-05-04 19:58:04 -04:00 · 2026-05-04 19:58:04 -04:00 · fb698d2c27
commit fb698d2c27
parent c7c5e0f3a1
97 changed files with 9932 additions and 517 deletions
--- a/tests/benchmark/cve_corpus/python/CVE-2023-6568/patched.py
+++ b/tests/benchmark/cve_corpus/python/CVE-2023-6568/patched.py
@ -0,0 +1,30 @@
+# Nyx CVE benchmark fixture.
+#
+# CVE:        CVE-2023-6568
+# Project:    MLflow (mlflow/mlflow)
+# License:    Apache-2.0  (https://github.com/mlflow/mlflow/blob/master/LICENSE.txt)
+# Advisory:   https://nvd.nist.gov/vuln/detail/CVE-2023-6568
+# Patched:    28ff3f94994941e038f2172c6484b65dc4db6ca1 mlflow/server/auth/__init__.py:744-770
+#
+# The fix replaces the f-string interpolation of the attacker-controlled
+# `content_type` header with a static error message.  No tainted value
+# reaches `make_response`, so the reflected-XSS sink is silent.
+
+from flask import request, make_response
+
+
+def catch_mlflow_exception(fn):
+    return fn
+
+
+@catch_mlflow_exception
+def create_user():
+    content_type = request.headers.get("Content-Type")
+    if content_type == "application/json":
+        return make_response({"user": "ok"})
+    else:
+        message = (
+            "Invalid content type. Must be one of: "
+            "application/x-www-form-urlencoded, application/json"
+        )
+        return make_response(message, 400)
--- a/tests/benchmark/cve_corpus/python/CVE-2023-6568/vulnerable.py
+++ b/tests/benchmark/cve_corpus/python/CVE-2023-6568/vulnerable.py
@ -0,0 +1,45 @@
+# Nyx CVE benchmark fixture.
+#
+# CVE:        CVE-2023-6568
+# Project:    MLflow (mlflow/mlflow)
+# License:    Apache-2.0  (https://github.com/mlflow/mlflow/blob/master/LICENSE.txt)
+# Advisory:   https://nvd.nist.gov/vuln/detail/CVE-2023-6568
+# Vulnerable: 28ff3f94994941e038f2172c6484b65dc4db6ca1~1 mlflow/server/auth/__init__.py:744-766
+#
+# Reflected Cross-Site Scripting in MLflow's auth server `create_user`
+# handler.  When a request arrived with an unrecognised `Content-Type`
+# header, the handler reflected the attacker-controlled header value
+# into a Flask response via an f-string and `make_response(...)`.
+# Because `make_response` returns the response unmodified (no escaping)
+# and Werkzeug serves the bytes back to the browser as text/html, the
+# header reflection becomes XSS in the browser.
+#
+# Trims:
+#   - imports / module-level setup (config, store, blueprints L1-30) —
+#     scaffolding only.
+#   - non-`create_user` handlers (`get_user`, `update_user_password`,
+#     `update_user_admin`, all later in the file) — same `make_response`
+#     call shape but with non-tainted inputs; not the disclosed sink.
+#   - `flash` / `alert` paths inside `create_user` (form-urlencoded and
+#     application/json branches) — those branches do not produce the
+#     reflected XSS; only the `else` branch does.
+#
+# Verbatim load-bearing lines: `content_type = request.headers.get(
+# "Content-Type")` (source) and `return make_response(f"Invalid content
+# type: '{content_type}'", 400)` (sink) are byte-for-byte from
+# mlflow/server/auth/__init__.py at the pre-fix SHA.
+
+from flask import request, make_response
+
+
+def catch_mlflow_exception(fn):
+    return fn
+
+
+@catch_mlflow_exception
+def create_user():
+    content_type = request.headers.get("Content-Type")
+    if content_type == "application/json":
+        return make_response({"user": "ok"})
+    else:
+        return make_response(f"Invalid content type: '{content_type}'", 400)
--- a/tests/benchmark/cve_corpus/python/CVE-2024-21513/patched.py
+++ b/tests/benchmark/cve_corpus/python/CVE-2024-21513/patched.py
@ -0,0 +1,26 @@
+# Nyx CVE benchmark fixture.
+#
+# CVE:        CVE-2024-21513
+# Project:    LangChain Experimental (langchain-ai/langchain)
+# License:    MIT  (https://github.com/langchain-ai/langchain/blob/master/LICENSE)
+# Advisory:   https://nvd.nist.gov/vuln/detail/CVE-2024-21513
+# Patched:    7b13292e3544b2f5f2bfb8a27a062ea2b0c34561
+#             libs/experimental/langchain_experimental/sql/vector_sql.py:79-83
+#
+# The fix removes the `_try_eval` helper entirely and returns the raw
+# `db._execute(...)` result without invoking `eval(...)` at all.  No
+# `eval` sink remains, so `py.code_exec.eval` is silent.
+
+from typing import Any, Dict, List, Union
+
+
+class SQLDatabase:
+    def _execute(self, cmd: str, fetch: str = "all") -> Any:
+        ...
+
+
+def get_result_from_sqldb(
+    db: SQLDatabase, cmd: str
+) -> Union[str, List[Dict[str, Any]], Dict[str, Any]]:
+    result = db._execute(cmd, fetch="all")  # type: ignore
+    return result
--- a/tests/benchmark/cve_corpus/python/CVE-2024-21513/vulnerable.py
+++ b/tests/benchmark/cve_corpus/python/CVE-2024-21513/vulnerable.py
@ -0,0 +1,56 @@
+# Nyx CVE benchmark fixture.
+#
+# CVE:        CVE-2024-21513
+# Project:    LangChain Experimental (langchain-ai/langchain)
+# License:    MIT  (https://github.com/langchain-ai/langchain/blob/master/LICENSE)
+# Advisory:   https://nvd.nist.gov/vuln/detail/CVE-2024-21513
+# Vulnerable: 7b13292e3544b2f5f2bfb8a27a062ea2b0c34561~1
+#             libs/experimental/langchain_experimental/sql/vector_sql.py:79-98
+#
+# `langchain_experimental.sql.vector_sql.VectorSQLDatabaseChain` ran
+# every value returned from a SQL query through Python's built-in
+# `eval(...)` so that string-shaped numbers / lists were converted into
+# Python objects.  An attacker who could control the database content
+# (for example by writing into a vector store backing the chain) could
+# return a value such as `__import__("os").system("rm -rf /")` and the
+# chain would `eval` it, achieving arbitrary code execution on the
+# server hosting the chain.
+#
+# Trims:
+#   - imports / non-load-bearing module decls (L1-30 of upstream).
+#   - `parse(self, text: str)` output-parser method (L70-77) and the
+#     `VectorSQLDatabaseChain` class body (L101-200) — neither is on
+#     the disclosed source→sink path.
+#   - SQLAlchemy / SQLDatabase type hints simplified to `Any` to avoid
+#     pulling the upstream type chain into the fixture.
+#
+# Verbatim load-bearing lines: the `_try_eval` helper definition and
+# the two dict / list comprehensions inside `get_result_from_sqldb`
+# that call `_try_eval(v)` on each query-result value are
+# byte-for-byte from vector_sql.py at the pre-fix SHA.
+
+from typing import Any, Dict, List, Union
+
+
+class SQLDatabase:
+    def _execute(self, cmd: str, fetch: str = "all") -> Any:
+        ...
+
+
+def _try_eval(x: Any) -> Any:
+    try:
+        return eval(x)
+    except Exception:
+        return x
+
+
+def get_result_from_sqldb(
+    db: SQLDatabase, cmd: str
+) -> Union[str, List[Dict[str, Any]], Dict[str, Any]]:
+    result = db._execute(cmd, fetch="all")  # type: ignore
+    if isinstance(result, list):
+        return [{k: _try_eval(v) for k, v in dict(d._asdict()).items()} for d in result]
+    else:
+        return {
+            k: _try_eval(v) for k, v in dict(result._asdict()).items()  # type: ignore
+        }
--- a/tests/benchmark/cve_corpus/python/CVE-2024-23334/patched.py
+++ b/tests/benchmark/cve_corpus/python/CVE-2024-23334/patched.py
@ -0,0 +1,57 @@
+# Nyx CVE benchmark fixture.
+#
+# CVE:        CVE-2024-23334
+# Project:    aiohttp (aio-libs/aiohttp)
+# License:    Apache-2.0  (https://github.com/aio-libs/aiohttp/blob/master/LICENSE.txt)
+# Advisory:   https://github.com/aio-libs/aiohttp/security/advisories/GHSA-5h86-8mv2-jq9f
+# Patched:    1c335944d6a8b1298baf179b7c0b3069f10c514b aiohttp/web_urldispatcher.py:644-668
+#
+# The fix splits the previously-unified resolve+containment check so
+# that ``relative_to(self._directory)`` is run on *both* arms of the
+# ``follow_symlinks`` branch.  In the follow-symlinks arm the path is
+# normalised with ``os.path.normpath`` *before* ``resolve()``, so a
+# ``..`` traversal out of the static directory raises ``ValueError``
+# from ``relative_to`` and is converted to ``HTTPNotFound``.
+#
+# Trims: same as vulnerable.py.
+#
+# Verbatim load-bearing lines: the rebuilt ``follow_symlinks`` branch
+# in ``_handle`` (L644-660), the new ``unresolved_path = self._directory
+# .joinpath(filename)`` step, and the ``normalized_path.relative_to(
+# self._directory)`` guard are byte-for-byte from
+# web_urldispatcher.py:644-660 of the fix commit.
+
+import os
+from pathlib import Path
+
+from aiohttp import web
+from aiohttp.web import FileResponse, HTTPForbidden, HTTPNotFound, Request, StreamResponse
+
+
+class StaticResource:
+    def __init__(self, directory: str, follow_symlinks: bool = True) -> None:
+        self._directory = Path(directory)
+        self._follow_symlinks = follow_symlinks
+        self._chunk_size = 256 * 1024
+
+    async def _handle(self, request: Request) -> StreamResponse:
+        rel_url = request.match_info["filename"]
+        try:
+            filename = Path(rel_url)
+            if filename.anchor:
+                raise HTTPForbidden()
+            unresolved_path = self._directory.joinpath(filename)
+            if self._follow_symlinks:
+                normalized_path = Path(os.path.normpath(unresolved_path))
+                normalized_path.relative_to(self._directory)
+                filepath = normalized_path.resolve()
+            else:
+                filepath = unresolved_path.resolve()
+                filepath.relative_to(self._directory)
+        except (ValueError, FileNotFoundError) as error:
+            raise HTTPNotFound() from error
+        except HTTPForbidden:
+            raise
+        if filepath.is_file():
+            return FileResponse(filepath, chunk_size=self._chunk_size)
+        raise HTTPNotFound
--- a/tests/benchmark/cve_corpus/python/CVE-2024-23334/vulnerable.py
+++ b/tests/benchmark/cve_corpus/python/CVE-2024-23334/vulnerable.py
@ -0,0 +1,62 @@
+# Nyx CVE benchmark fixture.
+#
+# CVE:        CVE-2024-23334
+# Project:    aiohttp (aio-libs/aiohttp)
+# License:    Apache-2.0  (https://github.com/aio-libs/aiohttp/blob/master/LICENSE.txt)
+# Advisory:   https://github.com/aio-libs/aiohttp/security/advisories/GHSA-5h86-8mv2-jq9f
+# Vulnerable: 33ccdfb0a12690af5bb49bda2319ec0907fa7827 aiohttp/web_urldispatcher.py:633-648
+#
+# aiohttp's StaticResource._handle resolved the requested filename
+# under the configured static directory and then verified containment
+# only when ``follow_symlinks`` was False.  When ``follow_symlinks=True``
+# the ``filepath.relative_to(self._directory)`` check was skipped, so a
+# symlink under the static directory, a ``..`` traversal, or an absolute
+# path that slipped past the anchor check could escape it and serve
+# files from anywhere on the filesystem the worker process could read.
+#
+# Trims:
+#   - ``append_version`` branch (L575-588) — separate code path that
+#     does not feed FileResponse on the disclosed flow.
+#   - ``HTTPNotFound`` / ``Exception`` handling fall-through after the
+#     try block (L646-654 of upstream) — irrelevant to source→sink.
+#   - ``_directory_as_html`` directory-listing branch (L658-708) —
+#     only ``FileResponse`` is the disclosed sink path.
+#
+# Verbatim load-bearing lines: the ``rel_url = request.match_info[
+# "filename"]`` source, the ``filepath = self._directory.joinpath(
+# filename).resolve()`` path composition, the ``relative_to`` guard
+# that runs only inside the ``if not self._follow_symlinks`` branch,
+# and the ``return FileResponse(filepath, chunk_size=self._chunk_size)``
+# sink are byte-for-byte from web_urldispatcher.py:633-648 and L666-668.
+
+from pathlib import Path
+
+from aiohttp import web
+from aiohttp.web import FileResponse, HTTPForbidden, HTTPNotFound, Request, StreamResponse
+
+
+class StaticResource:
+    def __init__(self, directory: str, follow_symlinks: bool = True) -> None:
+        self._directory = Path(directory)
+        self._follow_symlinks = follow_symlinks
+        self._chunk_size = 256 * 1024
+
+    async def _handle(self, request: Request) -> StreamResponse:
+        rel_url = request.match_info["filename"]
+        try:
+            filename = Path(rel_url)
+            if filename.anchor:
+                # rel_url is an absolute name like
+                # /static/\\machine_name\c$ or /static/D:\path
+                # where the static dir is totally different
+                raise HTTPForbidden()
+            filepath = self._directory.joinpath(filename).resolve()
+            if not self._follow_symlinks:
+                filepath.relative_to(self._directory)
+        except (ValueError, FileNotFoundError) as error:
+            raise HTTPNotFound() from error
+        except HTTPForbidden:
+            raise
+        if filepath.is_file():
+            return FileResponse(filepath, chunk_size=self._chunk_size)
+        raise HTTPNotFound