Prompt refactor (#125)

* Prompt manager integrated and working with 6 tests
* Updated templates for the prompt-template update
This commit is contained in:
cybermaggedon 2024-10-26 22:17:43 +01:00 committed by GitHub
parent 51aef6c730
commit 1e137768ca
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
19 changed files with 649 additions and 479 deletions

View file

@ -13,7 +13,7 @@ local prompts = import "prompts/mixtral.jsonnet";
create:: function(engine)
local envSecrets = engine.envSecrets("bedrock-credentials")
.with_env_var("GOOGLEAISTUDIO_KEY", "googleaistudio-key");
.with_env_var("GOOGLE_AI_STUDIO_KEY", "googleaistudio-key");
local container =
engine.container("text-completion")

View file

@ -17,22 +17,38 @@ local default_prompts = import "prompts/default-prompts.jsonnet";
"prompt-template",
"-p",
url.pulsar,
"--text-completion-request-queue",
"non-persistent://tg/request/text-completion",
"--text-completion-response-queue",
"non-persistent://tg/response/text-completion-response",
"--definition-template",
"--system-prompt",
$["system-template"],
"--prompt",
"question={{question}}",
"extract-definitions=" +
$["prompt-definition-template"],
"--relationship-template",
"extract-relationships=" +
$["prompt-relationship-template"],
"--topic-template",
"extract-topics=" +
$["prompt-topic-template"],
"--knowledge-query-template",
"kg-prompt=" +
$["prompt-knowledge-query-template"],
"--document-query-template",
"document-prompt=" +
$["prompt-document-query-template"],
"--rows-template",
"extract-rows=" +
$["prompt-rows-template"],
"--prompt-response-type",
"extract-definitions=json",
"extract-relationships=json",
"extract-topics=json",
"kg-prompt=text",
"document-prompt=text",
"extract-rows=json",
])
.with_limits("0.5", "128M")
.with_reservations("0.1", "128M");
@ -71,18 +87,33 @@ local default_prompts = import "prompts/default-prompts.jsonnet";
"non-persistent://tg/request/text-completion-rag",
"--text-completion-response-queue",
"non-persistent://tg/response/text-completion-rag-response",
"--definition-template",
"--system-prompt",
$["system-template"],
"--prompt",
"question={{question}}",
"extract-definitions=" +
$["prompt-definition-template"],
"--relationship-template",
"extract-relationships=" +
$["prompt-relationship-template"],
"--topic-template",
"extract-topics=" +
$["prompt-topic-template"],
"--knowledge-query-template",
"kg-prompt=" +
$["prompt-knowledge-query-template"],
"--document-query-template",
"document-prompt=" +
$["prompt-document-query-template"],
"--rows-template",
"extract-rows=" +
$["prompt-rows-template"],
"--prompt-response-type",
"extract-definitions=json",
"extract-relationships=json",
"extract-topics=json",
"kg-prompt=text",
"document-prompt=text",
"extract-rows=json",
])
.with_limits("0.5", "128M")
.with_reservations("0.1", "128M");

View file

@ -4,16 +4,18 @@
{
"prompt-definition-template":: "<instructions>\nStudy the following text and derive definitions for any discovered entities.\nDo not provide definitions for entities whose definitions are incomplete\nor unknown.\nOutput relationships in JSON format as an arary of objects with fields:\n- entity: the name of the entity\n- definition: English text which defines the entity\n</instructions>\n\n<text>\n{text}\n</text>\n\n<requirements>\nYou will respond only with raw JSON format data. Do not provide\nexplanations. Do not use special characters in the abstract text. The\nabstract will be written as plain text. Do not add markdown formatting\nor headers or prefixes. Do not include null or unknown definitions.\n</requirements>",
"system-template":: "You are a helpful assistant.",
"prompt-relationship-template":: "<instructions>\nStudy the following text and derive entity relationships. For each\nrelationship, derive the subject, predicate and object of the relationship.\nOutput relationships in JSON format as an arary of objects with fields:\n- subject: the subject of the relationship\n- predicate: the predicate\n- object: the object of the relationship\n- object-entity: false if the object is a simple data type: name, value or date. true if it is an entity.\n</instructions>\n\n<text>\n{text}\n</text>\n\n<requirements>\nYou will respond only with raw JSON format data. Do not provide\nexplanations. Do not use special characters in the abstract text. The\nabstract must be written as plain text. Do not add markdown formatting\nor headers or prefixes.\n</requirements>",
"prompt-definition-template":: "<instructions>\nStudy the following text and derive definitions for any discovered entities.\nDo not provide definitions for entities whose definitions are incomplete\nor unknown.\nOutput relationships in JSON format as an arary of objects with fields:\n- entity: the name of the entity\n- definition: English text which defines the entity\n</instructions>\n\n<text>\n{{text}}\n</text>\n\n<requirements>\nYou will respond only with raw JSON format data. Do not provide\nexplanations. Do not use special characters in the abstract text. The\nabstract will be written as plain text. Do not add markdown formatting\nor headers or prefixes. Do not include null or unknown definitions.\n</requirements>",
"prompt-topic-template":: "You are a helpful assistant that performs information extraction tasks for a provided text.\nRead the provided text. You will identify topics and their definitions in JSON.\n\nReading Instructions:\n- Ignore document formatting in the provided text.\n- Study the provided text carefully.\n\nHere is the text:\n{text}\n\nResponse Instructions: \n- Do not respond with special characters.\n- Return only topics that are concepts and unique to the provided text.\n- Respond only with well-formed JSON.\n- The JSON response shall be an array of objects with keys \"topic\" and \"definition\". \n- The JSON response shall use the following structure:\n\n```json\n[{{\"topic\": string, \"definition\": string}}]\n```\n\n- Do not write any additional text or explanations.",
"prompt-relationship-template":: "<instructions>\nStudy the following text and derive entity relationships. For each\nrelationship, derive the subject, predicate and object of the relationship.\nOutput relationships in JSON format as an arary of objects with fields:\n- subject: the subject of the relationship\n- predicate: the predicate\n- object: the object of the relationship\n- object-entity: false if the object is a simple data type: name, value or date. true if it is an entity.\n</instructions>\n\n<text>\n{{text}}\n</text>\n\n<requirements>\nYou will respond only with raw JSON format data. Do not provide\nexplanations. Do not use special characters in the abstract text. The\nabstract must be written as plain text. Do not add markdown formatting\nor headers or prefixes.\n</requirements>",
"prompt-knowledge-query-template":: "Study the following set of knowledge statements. The statements are written in Cypher format that has been extracted from a knowledge graph. Use only the provided set of knowledge statements in your response. Do not speculate if the answer is not found in the provided set of knowledge statements.\n\nHere's the knowledge statements:\n{graph}\n\nUse only the provided knowledge statements to respond to the following:\n{query}\n",
"prompt-topic-template":: "You are a helpful assistant that performs information extraction tasks for a provided text.\nRead the provided text. You will identify topics and their definitions in JSON.\n\nReading Instructions:\n- Ignore document formatting in the provided text.\n- Study the provided text carefully.\n\nHere is the text:\n{{text}}\n\nResponse Instructions: \n- Do not respond with special characters.\n- Return only topics that are concepts and unique to the provided text.\n- Respond only with well-formed JSON.\n- The JSON response shall be an array of objects with keys \"topic\" and \"definition\". \n- The JSON response shall use the following structure:\n\n```json\n[{\"topic\": string, \"definition\": string}]\n```\n\n- Do not write any additional text or explanations.",
"prompt-document-query-template":: "Study the following context. Use only the information provided in the context in your response. Do not speculate if the answer is not found in the provided set of knowledge statements.\n\nHere is the context:\n{documents}\n\nUse only the provided knowledge statements to respond to the following:\n{query}\n",
"prompt-knowledge-query-template":: "Study the following set of knowledge statements. The statements are written in Cypher format that has been extracted from a knowledge graph. Use only the provided set of knowledge statements in your response. Do not speculate if the answer is not found in the provided set of knowledge statements.\n\nHere's the knowledge statements:\n{% for edge in knowledge %}({{edge.s}})-[{{edge.p}}]->({{edge.o}})\n{%endfor%}\n\nUse only the provided knowledge statements to respond to the following:\n{{query}}\n",
"prompt-rows-template":: "<instructions>\nStudy the following text and derive objects which match the schema provided.\n\nYou must output an array of JSON objects for each object you discover\nwhich matches the schema. For each object, output a JSON object whose fields\ncarry the name field specified in the schema.\n</instructions>\n\n<schema>\n{schema}\n</schema>\n\n<text>\n{text}\n</text>\n\n<requirements>\nYou will respond only with raw JSON format data. Do not provide\nexplanations. Do not add markdown formatting or headers or prefixes.\n</requirements>",
"prompt-document-query-template":: "Study the following context. Use only the information provided in the context in your response. Do not speculate if the answer is not found in the provided set of knowledge statements.\n\nHere is the context:\n{{documents}}\n\nUse only the provided knowledge statements to respond to the following:\n{{query}}\n",
"prompt-rows-template":: "<instructions>\nStudy the following text and derive objects which match the schema provided.\n\nYou must output an array of JSON objects for each object you discover\nwhich matches the schema. For each object, output a JSON object whose fields\ncarry the name field specified in the schema.\n</instructions>\n\n<schema>\n{{schema}}\n</schema>\n\n<text>\n{{text}}\n</text>\n\n<requirements>\nYou will respond only with raw JSON format data. Do not provide\nexplanations. Do not add markdown formatting or headers or prefixes.\n</requirements>",
}

27
tests/README.prompts Normal file
View file

@ -0,0 +1,27 @@
The test-prompt-... scripts in this directory are run against the following prompt set:
# Invocation of the prompt-template service used when running the
# test-prompt-* scripts. Defines a system prompt, global terms, named
# prompts, per-prompt response types, term overrides, and JSON schemas.
prompt-template \
-p pulsar://localhost:6650 \
--system-prompt 'You are a {{attitude}}, you are called {{name}}' \
--global-term \
'name=Craig' \
'attitude=LOUD, SHOUTY ANNOYING BOT' \
--prompt \
'question={{question}}' \
'french-question={{question}}' \
"analyze=Find the name and age in this text, and output a JSON structure containing just the name and age fields: {{description}}. Don't add markup, just output the raw JSON object." \
"graph-query=Study the following knowledge graph, and then answer the question.\\n\nGraph:\\n{% for edge in knowledge %}({{edge.0}})-[{{edge.1}}]->({{edge.2}})\\n{%endfor%}\\nQuestion:\\n{{question}}" \
"extract-definition=Analyse the text provided, and then return a list of terms and definitions. The output should be a JSON array, each item in the array is an object with fields 'term' and 'definition'.Don't add markup, just output the raw JSON object. Here is the text:\\n{{text}}" \
--prompt-response-type \
'question=text' \
'analyze=json' \
'graph-query=text' \
'extract-definition=json' \
--prompt-term \
'question=name:Bonny' \
'french-question=attitude:French-speaking bot' \
--prompt-schema \
'analyze={ "type" : "object", "properties" : { "age": { "type" : "number" }, "name": { "type" : "string" } } }' \
'extract-definition={ "type": "array", "items": { "type": "object", "properties": { "term": { "type": "string" }, "definition": { "type": "string" } }, "required": [ "term", "definition" ] } }'

View file

@ -7,7 +7,13 @@ p = PromptClient(pulsar_host="pulsar://localhost:6650")
chunk = """I noticed a cat in my garden. It is a four-legged animal
which is a mammal and can be tame or wild. I wonder if it will be friends
with me. I think the cat's name is Fred and it has 4 legs"""
with me. I think the cat's name is Fred and it has 4 legs.
A cat is a small mammal.
A grapefruit is a citrus fruit.
"""
resp = p.request_definitions(
chunk=chunk,

19
tests/test-lang-topics Executable file
View file

@ -0,0 +1,19 @@
#!/usr/bin/env python3

"""Exercise the prompt service's topic-extraction endpoint against a
locally running Pulsar broker, printing each topic and its definition."""

import pulsar
from trustgraph.clients.prompt_client import PromptClient

client = PromptClient(pulsar_host="pulsar://localhost:6650")

chunk = """I noticed a cat in my garden. It is a four-legged animal
which is a mammal and can be tame or wild. I wonder if it will be friends
with me. I think the cat's name is Fred and it has 4 legs"""

topics = client.request_topics(chunk=chunk)

for item in topics:
    print(item.topic)
    print(" ", item.definition)

18
tests/test-prompt-analyze Executable file
View file

@ -0,0 +1,18 @@
#!/usr/bin/env python3

"""Invoke the generic 'analyze' prompt on a short description and
pretty-print the JSON object returned by the prompt service."""

import json
from trustgraph.clients.prompt_client import PromptClient

client = PromptClient(pulsar_host="pulsar://localhost:6650")

description = """Fred is a 4-legged cat who is 12 years old"""

result = client.request(id="analyze", terms={"description": description})

print(json.dumps(result, indent=4))

46
tests/test-prompt-extraction Executable file
View file

@ -0,0 +1,46 @@
#!/usr/bin/env python3

"""Run the 'extract-definition' prompt over a chunk of Space Shuttle
text and print each extracted term/definition pair.

The prompt (see tests/README.prompts) is configured to return a JSON
array of objects with 'term' and 'definition' fields.
"""

import json
from trustgraph.clients.prompt_client import PromptClient

p = PromptClient(pulsar_host="pulsar://localhost:6650")

chunk="""
The Space Shuttle was a reusable spacecraft that transported astronauts and cargo to and from Earth's orbit. It was designed to launch like a rocket, maneuver in orbit like a spacecraft, and land like an airplane. The Space Shuttle was NASA's space transportation system and was used for many purposes, including:
Carrying astronauts
The Space Shuttle could carry up to seven astronauts at a time.
Launching, recovering, and repairing satellites
The Space Shuttle could launch satellites into orbit, recover them, and repair them.
Building the International Space Station
The Space Shuttle carried large parts into space to build the International Space Station.
Conducting research
Astronauts conducted experiments in the Space Shuttle, which was like a science lab in space.
The Space Shuttle was retired in 2011 after the Columbia accident in 2003. The Columbia Accident Investigation Board report found that the Space Shuttle was unsafe and expensive to make safe.
Here are some other facts about the Space Shuttle:
The Space Shuttle was 184 ft tall and had a diameter of 29 ft.
The Space Shuttle had a mass of 4,480,000 lb.
The Space Shuttle's first flight was on April 12, 1981.
The Space Shuttle's last mission was in 2011.
"""

# NOTE: a stray unused variable q ("Tell me some facts...") copied from
# test-prompt-knowledge was removed here; this test only extracts
# definitions from the chunk above.

resp = p.request(
    id="extract-definition",
    terms = {
        "text": chunk,
    }
)

# Raw response first, then one term/definition pair per block.
print(resp)

for fact in resp:
    print(fact["term"], "::")
    print(fact["definition"])
    print()

View file

@ -0,0 +1,18 @@
#!/usr/bin/env python3

"""Ask the 'french-question' prompt a question; the service's per-prompt
term overrides shape how the bot answers."""

import pulsar
from trustgraph.clients.prompt_client import PromptClient

client = PromptClient(pulsar_host="pulsar://localhost:6650")

question = """What is the square root of 16?"""

answer = client.request(id="french-question", terms={"question": question})

print(answer)

44
tests/test-prompt-knowledge Executable file
View file

@ -0,0 +1,44 @@
#!/usr/bin/env python3

"""Feed a set of (subject, predicate, object) triples to the
'graph-query' prompt and print the answer to a question posed over
the resulting knowledge graph."""

import json
from trustgraph.clients.prompt_client import PromptClient

client = PromptClient(pulsar_host="pulsar://localhost:6650")

# Knowledge-graph edges as (subject, predicate, object) tuples.
knowledge = [
    ("accident", "evoked", "a wide range of deeply felt public responses"),
    ("Space Shuttle concept", "had", "genesis"),
    ("Commission", "had", "a mandate to develop recommendations for corrective or other action based upon the Commission's findings and determinations"),
    ("Commission", "established", "teams of persons"),
    ("Space Shuttle Challenger", "http://www.w3.org/2004/02/skos/core#definition", "A space shuttle that was destroyed in an accident during mission 51-L."),
    ("The mid fuselage", "contains", "the payload bay"),
    ("Volume I", "contains", "Chapter IX"),
    ("accident", "resulted in", "firm national resolve that those men and women be forever enshrined in the annals of American heroes"),
    ("Volume I", "contains", "Chapter VII"),
    ("Volume I", "contains", "Chapter II"),
    ("Volume I", "contains", "Chapter V"),
    ("Commission", "believes", "its investigation and report have been responsive to the request of the President and hopes that they will serve the best interests of the nation in restoring the United States space program to its preeminent position in the world"),
    ("Commission", "construe", "mandate"),
    ("accident", "became", "a milestone on the way to achieving the full potential that space offers to mankind"),
    ("Volume I", "contains", "The Commission"),
    ("Commission", "http://www.w3.org/2004/02/skos/core#definition", "A group established to investigate the space shuttle accident"),
    ("Volume I", "contains", "Appendix D"),
    ("Commission", "had", "a mandate to review the circumstances surrounding the accident to establish the probable cause or causes of the accident"),
    ("Volume I", "contains", "Recommendations")
]

q = "Tell me some facts in the knowledge graph"

# Term key order kept as in the prompt template's expectations.
query_terms = {
    "name": "Jayney",
    "knowledge": knowledge,
    "question": q
}

answer = client.request(id="graph-query", terms=query_terms)

print(answer)

18
tests/test-prompt-question Executable file
View file

@ -0,0 +1,18 @@
#!/usr/bin/env python3

"""Smoke-test the plain 'question' prompt through the prompt service."""

import pulsar
from trustgraph.clients.prompt_client import PromptClient

service = PromptClient(pulsar_host="pulsar://localhost:6650")

question = """What is the square root of 16?"""

reply = service.request(
    id="question",
    terms={"question": question},
)

print(reply)

View file

@ -0,0 +1,19 @@
#!/usr/bin/env python3

"""Ask the 'question' prompt while overriding the 'attitude' term so the
bot answers as a Spanish speaker."""

import pulsar
from trustgraph.clients.prompt_client import PromptClient

service = PromptClient(pulsar_host="pulsar://localhost:6650")

question = """What is the square root of 16?"""

terms = {
    "question": question,
    "attitude": "Spanish-speaking bot"
}

reply = service.request(id="question", terms=terms)

print(reply)

View file

@ -1,7 +1,9 @@
import _pulsar
import json
import dataclasses
from .. schema import PromptRequest, PromptResponse, Fact, RowSchema, Field
from .. schema import PromptRequest, PromptResponse
from .. schema import prompt_request_queue
from .. schema import prompt_response_queue
from . base import BaseClient
@ -12,6 +14,23 @@ WARN=_pulsar.LoggerLevel.Warn
INFO=_pulsar.LoggerLevel.Info
DEBUG=_pulsar.LoggerLevel.Debug
@dataclasses.dataclass
class Definition:
name: str
definition: str
@dataclasses.dataclass
class Relationship:
s: str
p: str
o: str
o_entity: str
@dataclasses.dataclass
class Topic:
topic: str
definition: str
class PromptClient(BaseClient):
def __init__(
@ -38,63 +57,116 @@ class PromptClient(BaseClient):
output_schema=PromptResponse,
)
def request(self, id, terms, timeout=300):
resp = self.call(
id=id,
terms={
k: json.dumps(v)
for k, v in terms.items()
},
timeout=timeout
)
if resp.text: return resp.text
return json.loads(resp.object)
def request_definitions(self, chunk, timeout=300):
return self.call(
kind="extract-definitions", chunk=chunk,
defs = self.request(
id="extract-definitions",
terms={
"text": chunk
},
timeout=timeout
).definitions
def request_topics(self, chunk, timeout=300):
)
return self.call(
kind="extract-topics", chunk=chunk,
timeout=timeout
).topics
return [
Definition(name=d["entity"], definition=d["definition"])
for d in defs
]
def request_relationships(self, chunk, timeout=300):
return self.call(
kind="extract-relationships", chunk=chunk,
rels = self.request(
id="extract-relationships",
terms={
"text": chunk
},
timeout=timeout
).relationships
)
return [
Relationship(
s=d["subject"],
p=d["predicate"],
o=d["object"],
o_entity=d["object-entity"]
)
for d in rels
]
def request_topics(self, chunk, timeout=300):
topics = self.request(
id="extract-topics",
terms={
"text": chunk
},
timeout=timeout
)
return [
Topic(topic=d["topic"], definition=d["definition"])
for d in topics
]
def request_rows(self, schema, chunk, timeout=300):
return self.call(
kind="extract-rows", chunk=chunk,
row_schema=RowSchema(
name=schema.name,
description=schema.description,
fields=[
Field(
name=f.name, type=str(f.type), size=f.size,
primary=f.primary, description=f.description,
)
for f in schema.fields
]
),
return self.request(
id="extract-rows",
terms={
"chunk": chunk,
"row-schema": {
"name": schema.name,
"description": schema.description,
"fields": [
{
"name": f.name, "type": str(f.type),
"size": f.size, "primary": f.primary,
"description": f.description,
}
for f in schema.fields
]
}
},
timeout=timeout
).rows
)
def request_kg_prompt(self, query, kg, timeout=300):
return self.call(
kind="kg-prompt",
query=query,
kg=[
Fact(s=v[0], p=v[1], o=v[2])
for v in kg
],
return self.request(
id="kg-prompt",
terms={
"query": query,
"knowledge": [
{ "s": v[0], "p": v[1], "o": v[2] }
for v in kg
]
},
timeout=timeout
).answer
)
def request_document_prompt(self, query, documents, timeout=300):
return self.call(
kind="document-prompt",
query=query,
documents=documents,
return self.request(
id="document-prompt",
terms={
"query": query,
"documents": documents,
},
timeout=timeout
).answer
)

View file

@ -39,20 +39,21 @@ class Fact(Record):
# schema, chunk -> rows
class PromptRequest(Record):
kind = String()
chunk = String()
query = String()
kg = Array(Fact())
documents = Array(Bytes())
row_schema = RowSchema()
id = String()
# JSON encoded values
terms = Map(String())
class PromptResponse(Record):
# Error case
error = Error()
answer = String()
definitions = Array(Definition())
topics = Array(Topic())
relationships = Array(Relationship())
rows = Array(Map(String()))
# Just plain text
text = String()
# JSON encoded
object = String()
prompt_request_queue = topic(
'prompt', kind='non-persistent', namespace='request'

View file

@ -56,6 +56,8 @@ setuptools.setup(
"neo4j",
"tiktoken",
"google-generativeai",
"ibis",
"jsonschema",
],
scripts=[
"scripts/chunker-recursive",

View file

@ -0,0 +1,25 @@
# Reference invocation of the prompt-template service: system prompt,
# global terms, named prompt templates, response types (text/json),
# per-prompt term overrides, and JSON schemas for response validation.
prompt-template \
-p pulsar://localhost:6650 \
--system-prompt 'You are a {{attitude}}, you are called {{name}}' \
--global-term \
'name=Craig' \
'attitude=LOUD, SHOUTY ANNOYING BOT' \
--prompt \
'question={{question}}' \
'french-question={{question}}' \
"analyze=Find the name and age in this text, and output a JSON structure containing just the name and age fields: {{description}}. Don't add markup, just output the raw JSON object." \
"graph-query=Study the following knowledge graph, and then answer the question.\\n\nGraph:\\n{% for edge in knowledge %}({{edge.0}})-[{{edge.1}}]->({{edge.2}})\\n{%endfor%}\\nQuestion:\\n{{question}}" \
"extract-definition=Analyse the text provided, and then return a list of terms and definitions. The output should be a JSON array, each item in the array is an object with fields 'term' and 'definition'.Don't add markup, just output the raw JSON object. Here is the text:\\n{{text}}" \
--prompt-response-type \
'question=text' \
'analyze=json' \
'graph-query=text' \
'extract-definition=json' \
--prompt-term \
'question=name:Bonny' \
'french-question=attitude:French-speaking bot' \
--prompt-schema \
'analyze={ "type" : "object", "properties" : { "age": { "type" : "number" }, "name": { "type" : "string" } } }' \
'extract-definition={ "type": "array", "items": { "type": "object", "properties": { "term": { "type": "string" }, "definition": { "type": "string" } }, "required": [ "term", "definition" ] } }'

View file

@ -0,0 +1,95 @@
import ibis
import json
from jsonschema import validate
import re
from trustgraph.clients.llm_client import LlmClient
class PromptConfiguration:
    """Bundle of prompt settings: system template text, global terms,
    and a mapping of prompt id -> Prompt.

    Fix: the original used mutable default arguments (global_terms={},
    prompts={}), so all instances constructed with defaults shared the
    same dict objects; mutations on one instance leaked into others.
    """

    def __init__(self, system_template, global_terms=None, prompts=None):
        self.system_template = system_template
        # Fresh dicts per instance; None sentinels keep the call
        # signature backward-compatible.
        self.global_terms = {} if global_terms is None else global_terms
        self.prompts = {} if prompts is None else prompts
class Prompt:
    """A single named prompt: template text plus response-handling
    metadata (response type, per-prompt terms, optional JSON schema)."""

    def __init__(self, template, response_type="text", terms=None, schema=None):
        # Plain value holder; rendering/validation happen elsewhere.
        self.schema = schema
        self.terms = terms
        self.response_type = response_type
        self.template = template
class PromptManager:
    """Renders configured prompt templates, calls the LLM, and
    post-processes responses (raw text, or parsed + schema-validated
    JSON).

    Templates are compiled once at construction (via ibis) so template
    syntax errors surface immediately rather than on first use.

    Fix: the original used bare `except:` clauses, which swallow the
    underlying error detail (and catch BaseException, e.g.
    KeyboardInterrupt). These are narrowed to `except Exception` and
    chained with `raise ... from e` so the cause is preserved; the
    visible error messages are unchanged.
    """

    def __init__(self, llm, config):
        self.llm = llm
        self.config = config
        self.terms = config.global_terms
        self.prompts = config.prompts

        try:
            self.system_template = ibis.Template(config.system_template)
        except Exception as e:
            raise RuntimeError("Error in system template") from e

        self.templates = {}
        for k, v in self.prompts.items():
            try:
                self.templates[k] = ibis.Template(v.template)
            except Exception as e:
                raise RuntimeError(f"Error in template: {k}") from e
            # Normalise missing per-prompt terms so invoke() can merge
            # dictionaries unconditionally.
            if v.terms is None:
                v.terms = {}

    def parse_json(self, text):
        """Extract and parse JSON from an LLM response.

        Accepts either a fenced ```json ... ``` block or raw JSON text;
        raises json.JSONDecodeError on malformed input.
        """
        json_match = re.search(r'```(?:json)?(.*?)```', text, re.DOTALL)
        if json_match:
            json_str = json_match.group(1).strip()
        else:
            # No code fence: assume the entire output is JSON
            json_str = text.strip()
        return json.loads(json_str)

    def invoke(self, id, input):
        """Render prompt `id` with merged terms, call the LLM, and
        return raw text or a parsed (optionally schema-validated)
        object, per the prompt's configured response type.
        """
        if id not in self.prompts:
            raise RuntimeError("ID invalid")

        # Precedence: per-invocation input > per-prompt terms > globals.
        terms = self.terms | self.prompts[id].terms | input

        resp_type = self.prompts[id].response_type

        prompt = {
            "system": self.system_template.render(terms),
            "prompt": self.templates[id].render(terms)
        }

        resp = self.llm.request(**prompt)
        print(resp, flush=True)

        if resp_type == "text":
            return resp

        if resp_type != "json":
            raise RuntimeError(f"Response type {resp_type} not known")

        try:
            obj = self.parse_json(resp)
        except Exception as e:
            raise RuntimeError("JSON parse fail") from e

        print(obj, flush=True)

        if self.prompts[id].schema:
            try:
                print(self.prompts[id].schema)
                validate(instance=obj, schema=self.prompts[id].schema)
            except Exception as e:
                raise RuntimeError(f"Schema validation fail: {e}") from e

        return obj

View file

@ -1,47 +0,0 @@
def to_relationships(template, text):
return template.format(text=text)
def to_definitions(template, text):
return template.format(text=text)
def to_topics(template, text):
return template.format(text=text)
def to_rows(template, schema, text):
field_schema = [
f"- Name: {f.name}\n Type: {f.type}\n Definition: {f.description}"
for f in schema.fields
]
field_schema = "\n".join(field_schema)
return template.format(schema=schema, text=text)
schema = f"""Object name: {schema.name}
Description: {schema.description}
Fields:
{schema}"""
prompt = f""""""
return prompt
def get_cypher(kg):
sg2 = []
for f in kg:
sg2.append(f"({f.s})-[{f.p}]->({f.o})")
kg = "\n".join(sg2)
kg = kg.replace("\\", "-")
return kg
def to_kg_query(template, query, kg):
cypher = get_cypher(kg)
return template.format(query=query, graph=cypher)
def to_document_query(template, query, docs):
docs = "\n\n".join(docs)
return template.format(query=query, documents=docs)

View file

@ -16,8 +16,7 @@ from .... schema import prompt_request_queue, prompt_response_queue
from .... base import ConsumerProducer
from .... clients.llm_client import LlmClient
from . prompts import to_definitions, to_relationships, to_rows
from . prompts import to_kg_query, to_document_query, to_topics
from . prompt_manager import PromptConfiguration, Prompt, PromptManager
module = ".".join(__name__.split(".")[1:-1])
@ -29,6 +28,82 @@ class Processor(ConsumerProducer):
def __init__(self, **params):
prompt_base = {}
# Parsing the prompt information to the prompt configuration
# structure
prompt_arg = params.get("prompt", [])
if prompt_arg:
for p in prompt_arg:
toks = p.split("=", 1)
if len(toks) < 2:
raise RuntimeError(f"Prompt string not well-formed: {p}")
prompt_base[toks[0]] = {
"template": toks[1]
}
prompt_response_type_arg = params.get("prompt_response_type", [])
if prompt_response_type_arg:
for p in prompt_response_type_arg:
toks = p.split("=", 1)
if len(toks) < 2:
raise RuntimeError(f"Response type not well-formed: {p}")
if toks[0] not in prompt_base:
raise RuntimeError(f"Response-type, {toks[0]} not known")
prompt_base[toks[0]]["response_type"] = toks[1]
prompt_schema_arg = params.get("prompt_schema", [])
if prompt_schema_arg:
for p in prompt_schema_arg:
toks = p.split("=", 1)
if len(toks) < 2:
raise RuntimeError(f"Schema arg not well-formed: {p}")
if toks[0] not in prompt_base:
raise RuntimeError(f"Schema, {toks[0]} not known")
try:
prompt_base[toks[0]]["schema"] = json.loads(toks[1])
except:
raise RuntimeError(f"Failed to parse JSON schema: {p}")
prompt_term_arg = params.get("prompt_term", [])
if prompt_term_arg:
for p in prompt_term_arg:
toks = p.split("=", 1)
if len(toks) < 2:
raise RuntimeError(f"Term arg not well-formed: {p}")
if toks[0] not in prompt_base:
raise RuntimeError(f"Term, {toks[0]} not known")
kvtoks = toks[1].split(":", 1)
if len(kvtoks) < 2:
raise RuntimeError(f"Term not well-formed: {toks[1]}")
k, v = kvtoks
if "terms" not in prompt_base[toks[0]]:
prompt_base[toks[0]]["terms"] = {}
prompt_base[toks[0]]["terms"][k] = v
global_terms = {}
global_term_arg = params.get("global_term", [])
if global_term_arg:
for t in global_term_arg:
toks = t.split("=", 1)
if len(toks) < 2:
raise RuntimeError(f"Global term arg not well-formed: {t}")
global_terms[toks[0]] = toks[1]
print(global_terms)
prompts = {
k: Prompt(**v)
for k, v in prompt_base.items()
}
prompt_configuration = PromptConfiguration(
system_template = params.get("system_prompt", ""),
global_terms = global_terms,
prompts = prompts
)
input_queue = params.get("input_queue", default_input_queue)
output_queue = params.get("output_queue", default_output_queue)
subscriber = params.get("subscriber", default_subscriber)
@ -64,23 +139,21 @@ class Processor(ConsumerProducer):
pulsar_host = self.pulsar_host
)
self.definition_template = definition_template
self.topic_template = topic_template
self.relationship_template = relationship_template
self.rows_template = rows_template
self.knowledge_query_template = knowledge_query_template
self.document_query_template = document_query_template
# System prompt hack
class Llm:
def __init__(self, llm):
self.llm = llm
def request(self, system, prompt):
print(system)
print(prompt, flush=True)
return self.llm.request(system + "\n\n" + prompt)
def parse_json(self, text):
json_match = re.search(r'```(?:json)?(.*?)```', text, re.DOTALL)
if json_match:
json_str = json_match.group(1).strip()
else:
# If no delimiters, assume the entire output is JSON
json_str = text.strip()
self.llm = Llm(self.llm)
return json.loads(json_str)
self.manager = PromptManager(
llm = self.llm,
config = prompt_configuration,
)
def handle(self, msg):
@ -90,88 +163,52 @@ class Processor(ConsumerProducer):
id = msg.properties()["id"]
kind = v.kind
print(f"Handling kind {kind}...", flush=True)
if kind == "extract-definitions":
self.handle_extract_definitions(id, v)
return
elif kind == "extract-topics":
self.handle_extract_topics(id, v)
return
elif kind == "extract-relationships":
self.handle_extract_relationships(id, v)
return
elif kind == "extract-rows":
self.handle_extract_rows(id, v)
return
elif kind == "kg-prompt":
self.handle_kg_prompt(id, v)
return
elif kind == "document-prompt":
self.handle_document_prompt(id, v)
return
else:
print("Invalid kind.", flush=True)
return
def handle_extract_definitions(self, id, v):
kind = v.id
try:
prompt = to_definitions(self.definition_template, v.chunk)
print(v.terms)
ans = self.llm.request(prompt)
input = {
k: json.loads(v)
for k, v in v.terms.items()
}
print(f"Handling kind {kind}...", flush=True)
print(input, flush=True)
# Silently ignore JSON parse error
try:
defs = self.parse_json(ans)
except:
print("JSON parse error, ignored", flush=True)
defs = []
resp = self.manager.invoke(kind, input)
output = []
if isinstance(resp, str):
for defn in defs:
print("Send text response...", flush=True)
print(resp, flush=True)
try:
e = defn["entity"]
d = defn["definition"]
r = PromptResponse(
text=resp,
object=None,
error=None,
)
if e == "": continue
if e is None: continue
if d == "": continue
if d is None: continue
self.producer.send(r, properties={"id": id})
output.append(
Definition(
name=e, definition=d
)
)
return
except:
print("definition fields missing, ignored", flush=True)
else:
print("Send response...", flush=True)
r = PromptResponse(definitions=output, error=None)
self.producer.send(r, properties={"id": id})
print("Send object response...", flush=True)
print(json.dumps(resp, indent=4), flush=True)
print("Done.", flush=True)
r = PromptResponse(
text=None,
object=json.dumps(resp),
error=None,
)
self.producer.send(r, properties={"id": id})
return
except Exception as e:
print(f"Exception: {e}")
@ -188,122 +225,6 @@ class Processor(ConsumerProducer):
self.producer.send(r, properties={"id": id})
def handle_extract_topics(self, id, v):
try:
prompt = to_topics(self.topic_template, v.chunk)
ans = self.llm.request(prompt)
# Silently ignore JSON parse error
try:
defs = self.parse_json(ans)
except:
print("JSON parse error, ignored", flush=True)
defs = []
output = []
for defn in defs:
try:
e = defn["topic"]
d = defn["definition"]
if e == "": continue
if e is None: continue
if d == "": continue
if d is None: continue
output.append(
Topic(
name=e, definition=d
)
)
except:
print("definition fields missing, ignored", flush=True)
print("Send response...", flush=True)
r = PromptResponse(topics=output, error=None)
self.producer.send(r, properties={"id": id})
print("Done.", flush=True)
except Exception as e:
print(f"Exception: {e}")
print("Send error response...", flush=True)
r = PromptResponse(
error=Error(
type = "llm-error",
message = str(e),
),
response=None,
)
self.producer.send(r, properties={"id": id})
def handle_extract_relationships(self, id, v):
try:
prompt = to_relationships(self.relationship_template, v.chunk)
ans = self.llm.request(prompt)
# Silently ignore JSON parse error
try:
defs = self.parse_json(ans)
except:
print("JSON parse error, ignored", flush=True)
defs = []
output = []
for defn in defs:
try:
s = defn["subject"]
p = defn["predicate"]
o = defn["object"]
o_entity = defn["object-entity"]
if s == "": continue
if s is None: continue
if p == "": continue
if p is None: continue
if o == "": continue
if o is None: continue
if o_entity == "" or o_entity is None:
o_entity = False
output.append(
Relationship(
s = s,
p = p,
o = o,
o_entity = o_entity,
)
)
except Exception as e:
print("relationship fields missing, ignored", flush=True)
print("Send response...", flush=True)
r = PromptResponse(relationships=output, error=None)
self.producer.send(r, properties={"id": id})
print("Done.", flush=True)
except Exception as e:
print(f"Exception: {e}")
@ -320,147 +241,6 @@ class Processor(ConsumerProducer):
self.producer.send(r, properties={"id": id})
def handle_extract_rows(self, id, v):
try:
fields = v.row_schema.fields
prompt = to_rows(self.rows_template, v.row_schema, v.chunk)
print(prompt)
ans = self.llm.request(prompt)
print(ans)
# Silently ignore JSON parse error
try:
objs = self.parse_json(ans)
except:
print("JSON parse error, ignored", flush=True)
objs = []
output = []
for obj in objs:
try:
row = {}
for f in fields:
if f.name not in obj:
print(f"Object ignored, missing field {f.name}")
row = {}
break
row[f.name] = obj[f.name]
if row == {}:
continue
output.append(row)
except Exception as e:
print("row fields missing, ignored", flush=True)
for row in output:
print(row)
print("Send response...", flush=True)
r = PromptResponse(rows=output, error=None)
self.producer.send(r, properties={"id": id})
print("Done.", flush=True)
except Exception as e:
print(f"Exception: {e}")
print("Send error response...", flush=True)
r = PromptResponse(
error=Error(
type = "llm-error",
message = str(e),
),
response=None,
)
self.producer.send(r, properties={"id": id})
def handle_kg_prompt(self, id, v):
try:
prompt = to_kg_query(self.knowledge_query_template, v.query, v.kg)
print(prompt)
ans = self.llm.request(prompt)
print(ans)
print("Send response...", flush=True)
r = PromptResponse(answer=ans, error=None)
self.producer.send(r, properties={"id": id})
print("Done.", flush=True)
except Exception as e:
print(f"Exception: {e}")
print("Send error response...", flush=True)
r = PromptResponse(
error=Error(
type = "llm-error",
message = str(e),
),
response=None,
)
self.producer.send(r, properties={"id": id})
def handle_document_prompt(self, id, v):
try:
prompt = to_document_query(
self.document_query_template, v.query, v.documents
)
print(prompt)
ans = self.llm.request(prompt)
print(ans)
print("Send response...", flush=True)
r = PromptResponse(answer=ans, error=None)
self.producer.send(r, properties={"id": id})
print("Done.", flush=True)
except Exception as e:
print(f"Exception: {e}")
print("Send error response...", flush=True)
r = PromptResponse(
error=Error(
type = "llm-error",
message = str(e),
),
response=None,
)
self.producer.send(r, properties={"id": id})
@staticmethod
def add_args(parser):
@ -482,39 +262,33 @@ class Processor(ConsumerProducer):
)
parser.add_argument(
'--definition-template',
required=True,
help=f'Definition extraction template',
'--prompt', nargs='*',
help=f'Prompt template form id=template',
)
parser.add_argument(
'--topic-template',
required=True,
help=f'Topic extraction template',
'--prompt-response-type', nargs='*',
help=f'Prompt response type, form id=json|text',
)
parser.add_argument(
'--rows-template',
required=True,
help=f'Rows extraction template',
'--prompt-term', nargs='*',
help=f'Prompt response type, form id=key:value',
)
parser.add_argument(
'--relationship-template',
required=True,
help=f'Relationship extraction template',
'--prompt-schema', nargs='*',
help=f'Prompt response schema, form id=schema',
)
parser.add_argument(
'--knowledge-query-template',
required=True,
help=f'Knowledge query template',
'--system-prompt',
help=f'System prompt template',
)
parser.add_argument(
'--document-query-template',
required=True,
help=f'Document query template',
'--global-term', nargs='+',
help=f'Global term, form key:value'
)
def run():