Added chunk_size metrics, and added metrics to dashboard (#16)

This commit is contained in:
cybermaggedon 2024-08-22 00:20:24 +01:00 committed by GitHub
parent d3cdb97528
commit 0043b871ff
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 106 additions and 5 deletions

View file

@ -408,6 +408,88 @@
"title": "Pub/sub backlog",
"type": "timeseries"
},
{
  "datasource": {
    "type": "prometheus",
    "uid": "f6b18033-5918-4e05-a1ca-4cb30343b129"
  },
  "description": "Distribution of chunk sizes over time, taken from the chunk_size histogram (observed as the length of each chunk's text). Bucket rates are summed by le so that several chunker instances exporting the same metric render as one heatmap.",
  "fieldConfig": {
    "defaults": {
      "custom": {
        "hideFrom": {
          "legend": false,
          "tooltip": false,
          "viz": false
        },
        "scaleDistribution": {
          "type": "linear"
        }
      }
    },
    "overrides": []
  },
  "gridPos": {
    "h": 6,
    "w": 12,
    "x": 0,
    "y": 17
  },
  "id": 9,
  "options": {
    "calculate": false,
    "cellGap": 1,
    "color": {
      "exponent": 0.5,
      "fill": "dark-orange",
      "mode": "scheme",
      "reverse": false,
      "scale": "exponential",
      "scheme": "Greys",
      "steps": 64
    },
    "exemplars": {
      "color": "rgba(255,0,255,0.7)"
    },
    "filterValues": {
      "le": 1e-9
    },
    "legend": {
      "show": true
    },
    "rowsFrame": {
      "layout": "auto"
    },
    "tooltip": {
      "show": true,
      "yHistogram": false
    },
    "yAxis": {
      "axisPlacement": "left",
      "reverse": false
    }
  },
  "pluginVersion": "10.0.0",
  "targets": [
    {
      "datasource": {
        "type": "prometheus",
        "uid": "f6b18033-5918-4e05-a1ca-4cb30343b129"
      },
      "editorMode": "builder",
      "exemplar": false,
      "expr": "sum by (le) (rate(chunk_size_bucket[$__rate_interval]))",
      "format": "heatmap",
      "hide": false,
      "instant": false,
      "interval": "",
      "legendFormat": "{{le}}",
      "range": true,
      "refId": "A"
    }
  ],
  "title": "Chunk size",
  "type": "heatmap"
},
{
"datasource": {
"type": "prometheus",
@ -417,7 +499,7 @@
"h": 11,
"w": 24,
"x": 0,
"y": 17
"y": 23
},
"id": 8,
"targets": [
@ -438,7 +520,7 @@
"type": "nodeGraph"
}
],
"refresh": "5m",
"refresh": "5s",
"schemaVersion": 38,
"style": "dark",
"tags": [],
@ -446,13 +528,13 @@
"list": []
},
"time": {
"from": "now-30m",
"from": "now-15m",
"to": "now"
},
"timepicker": {},
"timezone": "",
"title": "Overview",
"uid": "b5c8abf8-fe79-496b-b028-10bde917d1f0",
"version": 3,
"version": 2,
"weekStart": ""
}

View file

@ -5,7 +5,7 @@ as text as separate output objects.
"""
from langchain_text_splitters import RecursiveCharacterTextSplitter
from prometheus_client import Histogram
from ... schema import TextDocument, Chunk, Source
from ... schema import text_ingest_queue, chunk_ingest_queue
@ -38,6 +38,13 @@ class Processor(ConsumerProducer):
}
)
if not hasattr(__class__, "chunk_metric"):
__class__.chunk_metric = Histogram(
'chunk_size', 'Chunk size',
buckets=[100, 160, 250, 400, 650, 1000, 1600,
2500, 4000, 6400, 10000, 16000]
)
self.text_splitter = RecursiveCharacterTextSplitter(
chunk_size=chunk_size,
chunk_overlap=chunk_overlap,
@ -67,6 +74,8 @@ class Processor(ConsumerProducer):
chunk=chunk.page_content.encode("utf-8"),
)
__class__.chunk_metric.observe(len(chunk.page_content))
self.send(r)
print("Done.", flush=True)

View file

@ -5,6 +5,7 @@ as text as separate output objects.
"""
from langchain_text_splitters import TokenTextSplitter
from prometheus_client import Histogram
from ... schema import TextDocument, Chunk, Source
from ... schema import text_ingest_queue, chunk_ingest_queue
@ -37,6 +38,13 @@ class Processor(ConsumerProducer):
}
)
if not hasattr(__class__, "chunk_metric"):
__class__.chunk_metric = Histogram(
'chunk_size', 'Chunk size',
buckets=[100, 160, 250, 400, 650, 1000, 1600,
2500, 4000, 6400, 10000, 16000]
)
self.text_splitter = TokenTextSplitter(
encoding_name="cl100k_base",
chunk_size=chunk_size,
@ -65,6 +73,8 @@ class Processor(ConsumerProducer):
chunk=chunk.page_content.encode("utf-8"),
)
__class__.chunk_metric.observe(len(chunk.page_content))
self.send(r)
print("Done.", flush=True)