diff --git a/grafana/dashboard.json b/grafana/dashboard.json index f1e76f72..4297d8c0 100644 --- a/grafana/dashboard.json +++ b/grafana/dashboard.json @@ -408,6 +408,88 @@ "title": "Pub/sub backlog", "type": "timeseries" }, + { + "datasource": { + "type": "prometheus", + "uid": "f6b18033-5918-4e05-a1ca-4cb30343b129" + }, + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 17 + }, + "id": 9, + "options": { + "calculate": false, + "cellGap": 1, + "color": { + "exponent": 0.5, + "fill": "dark-orange", + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "Greys", + "steps": 64 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto" + }, + "tooltip": { + "show": true, + "yHistogram": false + }, + "yAxis": { + "axisPlacement": "left", + "reverse": false + } + }, + "pluginVersion": "10.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "f6b18033-5918-4e05-a1ca-4cb30343b129" + }, + "editorMode": "builder", + "exemplar": false, + "expr": "rate(chunk_size_bucket[$__rate_interval])", + "format": "heatmap", + "hide": false, + "instant": false, + "interval": "", + "legendFormat": "{{le}}", + "range": true, + "refId": "A" + } + ], + "title": "Chunk size", + "type": "heatmap" + }, { "datasource": { "type": "prometheus", @@ -417,7 +499,7 @@ "h": 11, "w": 24, "x": 0, - "y": 17 + "y": 23 }, "id": 8, "targets": [ @@ -438,7 +520,7 @@ "type": "nodeGraph" } ], - "refresh": "5m", + "refresh": "5s", "schemaVersion": 38, "style": "dark", "tags": [], @@ -446,13 +528,13 @@ "list": [] }, "time": { - "from": "now-30m", + "from": "now-15m", "to": "now" }, "timepicker": {}, "timezone": "", "title": "Overview", "uid": "b5c8abf8-fe79-496b-b028-10bde917d1f0", - "version": 3, + "version": 2, "weekStart": "" } diff --git a/trustgraph/chunking/recursive/chunker.py b/trustgraph/chunking/recursive/chunker.py index d38b580a..fe1a0cee 100755 --- a/trustgraph/chunking/recursive/chunker.py +++ b/trustgraph/chunking/recursive/chunker.py @@ -5,7 +5,7 @@ as text as separate output objects. """ from langchain_text_splitters import RecursiveCharacterTextSplitter - +from prometheus_client import Histogram from ... schema import TextDocument, Chunk, Source from ... schema import text_ingest_queue, chunk_ingest_queue @@ -38,6 +38,13 @@ class Processor(ConsumerProducer): } ) + if not hasattr(__class__, "chunk_metric"): + __class__.chunk_metric = Histogram( + 'chunk_size', 'Chunk size', + buckets=[100, 160, 250, 400, 650, 1000, 1600, + 2500, 4000, 6400, 10000, 16000] + ) + self.text_splitter = RecursiveCharacterTextSplitter( chunk_size=chunk_size, chunk_overlap=chunk_overlap, @@ -67,6 +74,8 @@ class Processor(ConsumerProducer): chunk=chunk.page_content.encode("utf-8"), ) + __class__.chunk_metric.observe(len(chunk.page_content)) + self.send(r) print("Done.", flush=True) diff --git a/trustgraph/chunking/token/chunker.py b/trustgraph/chunking/token/chunker.py index 78afeff3..c152b0fd 100755 --- a/trustgraph/chunking/token/chunker.py +++ b/trustgraph/chunking/token/chunker.py @@ -5,6 +5,7 @@ as text as separate output objects. """ from langchain_text_splitters import TokenTextSplitter +from prometheus_client import Histogram from ... schema import TextDocument, Chunk, Source from ... schema import text_ingest_queue, chunk_ingest_queue @@ -37,6 +38,13 @@ class Processor(ConsumerProducer): } ) + if not hasattr(__class__, "chunk_metric"): + __class__.chunk_metric = Histogram( + 'chunk_size', 'Chunk size', + buckets=[100, 160, 250, 400, 650, 1000, 1600, + 2500, 4000, 6400, 10000, 16000] + ) + self.text_splitter = TokenTextSplitter( encoding_name="cl100k_base", chunk_size=chunk_size, @@ -65,6 +73,8 @@ class Processor(ConsumerProducer): chunk=chunk.page_content.encode("utf-8"), ) + __class__.chunk_metric.observe(len(chunk.page_content)) + self.send(r) print("Done.", flush=True)