Merge pull request 'dev-v0.7.x' (#21) from dev-v0.7.x into main

Reviewed-on: https://bitfreedom.net/code/code/nomyo-ai/nomyo-router/pulls/21
2026-04-08 14:01:54 +02:00 · 2026-04-08 14:01:54 +02:00 · 07b80e654f
commit 07b80e654f
parent 0bf91a6dd0 a432a65396
3 changed files with 11 additions and 8 deletions
--- a/.forgejo/workflows/docker-publish-semantic.yml
+++ b/.forgejo/workflows/docker-publish-semantic.yml
@@ -18,7 +18,6 @@ on:
 env:
  REGISTRY: bitfreedom.net
  IMAGE_NAME: ${{ github.repository }}
-  CACHE_IMAGE: ${{ github.repository }}-buildcache-semantic
  DOCKER_BUILD_SUMMARY: "false"
 jobs:
@@ -87,9 +86,9 @@ jobs:
          provenance: false
          build-args: |
            SEMANTIC_CACHE=true
-          tags: ${{ env.REGISTRY }}/${{ env.CACHE_IMAGE }}:platform-semantic-${{ matrix.arch }}
+          tags: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:semantic-platform-${{ matrix.arch }}
-          cache-from: type=registry,ref=${{ env.REGISTRY }}/${{ env.CACHE_IMAGE }}:buildcache-semantic-${{ matrix.arch }}
+          cache-from: type=registry,ref=${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:buildcache-semantic-${{ matrix.arch }}
-          cache-to: type=registry,ref=${{ env.REGISTRY }}/${{ env.CACHE_IMAGE }}:buildcache-semantic-${{ matrix.arch }},mode=min
+          cache-to: type=registry,ref=${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:buildcache-semantic-${{ matrix.arch }},mode=max
  merge:
    runs-on: docker-amd64
@@ -145,6 +144,6 @@ jobs:
        run: |
          docker buildx imagetools create \
            $(jq -cr '.tags | map("-t " + .) | join(" ")' <<< "$DOCKER_METADATA_OUTPUT_JSON") \
-            ${{ env.REGISTRY }}/${{ env.CACHE_IMAGE }}:platform-semantic-amd64 \
+            ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:semantic-platform-amd64 \
-            ${{ env.REGISTRY }}/${{ env.CACHE_IMAGE }}:platform-semantic-arm64
+            ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:semantic-platform-arm64
--- a/.forgejo/workflows/docker-publish.yml
+++ b/.forgejo/workflows/docker-publish.yml
@@ -79,7 +79,7 @@ jobs:
          provenance: false
          tags: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:platform-${{ matrix.arch }}
          cache-from: type=registry,ref=${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:buildcache-${{ matrix.arch }}
-          cache-to: type=registry,ref=${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:buildcache-${{ matrix.arch }},mode=min
+          cache-to: type=registry,ref=${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:buildcache-${{ matrix.arch }},mode=max
  merge:
    runs-on: docker-amd64
--- a/router.py
+++ b/router.py
@@ -2110,7 +2110,11 @@ async def chat_proxy(request: Request):
                        # Only cache when no max_tokens limit was set — otherwise
                        # finish_reason=length might just mean max_tokens was hit,
                        # not that the context window was exhausted.
-                        _req_max_tok = params.get("max_tokens") or params.get("max_completion_tokens") or params.get("num_predict")
+                        _req_max_tok = (
+                            params.get("max_tokens") or params.get("max_completion_tokens") or params.get("num_predict")
+                            if use_openai else
+                            (options.get("num_predict") if options else None)
+                        )
                        if _dr == "length" and not _req_max_tok:
                            _pt = getattr(chunk, "prompt_eval_count", 0) or 0
                            _ct = getattr(chunk, "eval_count", 0) or 0