
Commit d8a6434

Bat ai/log analysis (NVIDIA#355)
* log_analysis using self-corrective RAG
* updated to ChatNVIDIA
* removed ChatOpenAI
* updated with minor changes
* minor changes
* minor changes to readme
* minor fixes
* minor changes in readme
* updated readme
* updated to Nemotron model
* updated requirements
* avoided the recursion limit
* updated to Nemotron model and changed reasoning prompts
1 parent 0961341 commit d8a6434

File tree

6 files changed: 39 additions & 22 deletions


community/log_analysis_multi_agent_rag/README.md

Lines changed: 1 addition & 1 deletion
@@ -48,7 +48,7 @@ This repository provides a sample code to demonstrate how you can use the log an
 
 # Software Components
 NVIDIA NIM Microservices
-- NIM of meta/llama-3.1-70b-instruct
+- NIM of nvidia/llama-3.3-nemotron-super-49b-v1.5
 - Retriever Models
 - NIM of nvidia/llama-3_2-nv-embedqa-1b-v2
 - NIM of nvidia/llama-3_2-nv-rerankqa-1b-v2

community/log_analysis_multi_agent_rag/graphedges.py

Lines changed: 24 additions & 7 deletions
@@ -11,6 +11,12 @@ def decide_to_generate(state):
     print("ASSESS GRADED DOCUMENTS")
     state["question"]
     filtered_documents = state["documents"]
+    transform_count = state.get("transform_count", 0)
+
+    # If we've transformed too many times, force generation
+    if transform_count >= 2:
+        print("DECISION: MAX TRANSFORMS REACHED, FORCING GENERATION")
+        return "generate"
 
     if not filtered_documents:
         print(
@@ -33,10 +39,21 @@ def grade_generation_vs_documents_and_question(state):
     generation = state["generation"]
 
     print("GRADE GENERATED vs QUESTION")
-    score = automation.answer_grader.invoke({"question": question, "generation": generation})
-    grade = score.binary_score
-    if grade == "yes":
-        print("DECISION: GENERATION ADDRESSES QUESTION")
-        return "useful"
-    print("DECISION: GENERATION DOES NOT ADDRESS QUESTION")
-    return "not useful"
+    try:
+        score_text = automation.answer_grader.invoke({"question": question, "generation": generation})
+        if "yes" in score_text.lower():
+            print("DECISION: GENERATION ADDRESSES QUESTION")
+            return "useful"
+        else:
+            # Check if we've transformed too many times
+            transform_count = state.get("transform_count", 0)
+            if transform_count >= 2:
+                print("DECISION: MAX TRANSFORMS REACHED, ACCEPTING GENERATION")
+                return "useful"
+            else:
+                print("DECISION: GENERATION DOES NOT ADDRESS QUESTION")
+                return "not useful"
+    except:
+        # If grading fails, assume generation is useful to avoid infinite loops
+        print("DECISION: GRADING FAILED, ACCEPTING GENERATION")
+        return "useful"
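The transform_count checks above cap how many times the graph can loop back to the query-rewrite step, which is the recursion-limit fix called out in the commit message. For the cap to ever trigger, some node has to increment the counter in the graph state; a minimal sketch of what that node might look like is below. The node name transform_query and the automation.question_rewriter chain are assumptions, since that file is not part of this diff.

# Hypothetical sketch, not code from this commit: the query-rewrite node must bump
# transform_count in the returned state, or the >= 2 guards in graphedges.py never fire.
def transform_query(state):
    """Rewrite the question and increment the counter the conditional edges check."""
    # automation.question_rewriter is assumed to be a chain built from the re_write prompts
    better_question = automation.question_rewriter.invoke({"question": state["question"]})
    return {
        "documents": state["documents"],
        "question": better_question,
        "transform_count": state.get("transform_count", 0) + 1,
    }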

community/log_analysis_multi_agent_rag/multiagent.py

Lines changed: 3 additions & 3 deletions
@@ -18,13 +18,13 @@ def __init__(self, file_path, api_key):
         self.hybrid_retriever = self.create_hybrid_retriever()
 
     def initialize_nvidia_components(self):
-        embeddings =NVIDIAEmbeddings(model="nvidia/llama-3.2-nv-embedqa-1b-v2", truncate="NONE")
+        embeddings =NVIDIAEmbeddings(model="nvidia/llama-3.2-nv-embedqa-1b-v2", truncate="END")
         return embeddings
 
     def load_and_split_documents(self):
         loader = TextLoader(self.file_path)
         docs = loader.load()
-        text_splitter = RecursiveCharacterTextSplitter(chunk_size=5000, chunk_overlap=600)
+        text_splitter = RecursiveCharacterTextSplitter(chunk_size=20000, chunk_overlap=10000)
         doc_splits = text_splitter.split_documents(docs)
         return doc_splits
 
@@ -35,7 +35,7 @@ def create_retrievers(self):
         return bm25_retriever, faiss_retriever
 
     def create_hybrid_retriever(self):
-        hybrid_retriever = EnsembleRetriever(retrievers=[self.bm25_retriever, self.faiss_retriever], weights=[0.7, 0.3])
+        hybrid_retriever = EnsembleRetriever(retrievers=[self.bm25_retriever, self.faiss_retriever], weights=[0.5, 0.5])
         return hybrid_retriever
 
     def get_retriever(self):
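Taken together, these parameter changes alter how the retriever is built: much larger overlapping chunks, end-truncation so over-long chunks are presumably cut down by the embedding NIM instead of erroring, and an evenly weighted BM25/FAISS ensemble. A rough end-to-end sketch of that pipeline is below; the import paths and the sample.log path are illustrative and may differ from the repo's actual code.

from langchain_community.document_loaders import TextLoader
from langchain_community.retrievers import BM25Retriever
from langchain_community.vectorstores import FAISS
from langchain.retrievers import EnsembleRetriever
from langchain_nvidia_ai_endpoints import NVIDIAEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

docs = TextLoader("sample.log").load()                       # illustrative log file path
splits = RecursiveCharacterTextSplitter(
    chunk_size=20000, chunk_overlap=10000                    # new, much larger chunks
).split_documents(docs)

embeddings = NVIDIAEmbeddings(model="nvidia/llama-3.2-nv-embedqa-1b-v2", truncate="END")
bm25_retriever = BM25Retriever.from_documents(splits)                       # keyword search
faiss_retriever = FAISS.from_documents(splits, embeddings).as_retriever()   # vector search
hybrid_retriever = EnsembleRetriever(
    retrievers=[bm25_retriever, faiss_retriever], weights=[0.5, 0.5]        # now weighted equally
)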
community/log_analysis_multi_agent_rag/prompt.json

Lines changed: 10 additions & 10 deletions
@@ -1,12 +1,12 @@
 {
-  "qa_system_prompt": "Act as an experienced QA automation engineer with expertise in analyzing logs and extract details from the same. Your job is to analyze the provided log file and answer user questions to help them file an actionable bug. Answer solely based on the following context:\n<Documents>\n{context}",
-  "qa_user_prompt": "{question}",
-  "re_write_system": "You are an expert in prompt engineering for GenAI RAG application. Your job is to write effective prompt to help retrier in fetching accruate documents. You a question re-writer that converts an input question to a better version that is optimized for vectorstore retrieval.",
-  "re_write_human": "\n\nHere is the initial prompt: \n\n {question} \n Formulate an improved prompt by keeping the original intent to make sure accurate results get generated.",
-  "grade_system": "You are a grader assessing relevance of a retrieved document to a user question. It does not need to be a stringent test. The goal is to filter out erroneous retrievals. If the document contains keyword(s) or semantic meaning related to the user question, grade it as relevant. Give a binary score 'yes' or 'no' score to indicate whether the document is relevant to the question.",
-  "grade_human": "Retrieved document: \n\n {document} \n\n User question: {question}",
-  "hallucination_system": "You are a grader assessing whether an LLM generation is grounded in / supported by a set of retrieved facts. Give a binary score 'yes' or 'no'. 'Yes' means that the answer is grounded in / supported by the set of facts.",
-  "hallucination_human": "Set of facts: \n\n {documents} \n\n LLM generation: {generation}",
-  "answer_system": "You are a grader assessing whether an answer addresses / resolves a question. Give a binary score 'yes' or 'no'. 'Yes' means that the answer resolves the question.",
-  "answer_human": "User question: \n\n {question} \n\n LLM generation: {generation}"
+  "qa_system_prompt": "You are an expert QA automation engineer specializing in log analysis and debugging. Your task is to analyze log files and provide accurate, actionable insights.\n\nINSTRUCTIONS:\n1. Base your analysis STRICTLY on the provided log context - do not add information not present in the logs\n2. Structure your response with clear sections: Summary, Key Issues, Error Details, and Recommendations\n3. Focus on actionable findings that help developers debug issues\n4. When analyzing errors, include timestamps, error codes, and relevant context\n5. If information is missing or unclear in the logs, explicitly state this limitation\n\nCONTEXT DOCUMENTS:\n{context}\n\nProvide a comprehensive analysis based solely on the above log data.",
+  "qa_user_prompt": "Question: {question}\n\nPlease analyze the log data and provide a detailed response following the structured format outlined in the system instructions.",
+  "re_write_system": "You are a prompt optimization specialist for RAG (Retrieval-Augmented Generation) systems. Your goal is to rewrite user queries to improve document retrieval accuracy for log analysis tasks.\n\nREWRITE GUIDELINES:\n1. Preserve the original intent and scope of the question\n2. Add relevant technical keywords related to logging, debugging, and software testing\n3. Make the query more specific to improve vector similarity matching\n4. Include common log analysis terms like 'error', 'failure', 'exception', 'stack trace', 'timestamp'\n5. Structure the query to match how information typically appears in log files\n\nEXAMPLE:\nOriginal: 'What went wrong?'\nRewritten: 'What error messages, exceptions, or failure indicators are present in the log file with their timestamps and context?'",
+  "re_write_human": "Original query: {question}\n\nRewrite this query to be more effective for retrieving relevant log analysis documents. Focus on:\n- Adding specific logging terminology\n- Making the intent clearer\n- Including context about what type of log information is needed\n\nRewritten query:",
+  "grade_system": "You are a document relevance evaluator for a log analysis system. Your task is to determine if a retrieved document contains information relevant to answering a user's question about log analysis.\n\nEVALUATION CRITERIA:\n- Document contains keywords, error messages, or concepts related to the question\n- Document provides context about system behavior, errors, or debugging information\n- Document includes timestamps, error codes, or technical details relevant to the query\n- Even partial relevance should be considered as 'yes' to avoid missing important context\n\nRESPONSE FORMAT: Respond with ONLY a JSON object containing a single key 'binary_score' with value 'yes' or 'no'.\n\nEXAMPLE RESPONSES:\n{{\"binary_score\": \"yes\"}}\n{{\"binary_score\": \"no\"}}",
+  "grade_human": "DOCUMENT TO EVALUATE:\n{document}\n\nUSER QUESTION:\n{question}\n\nIs this document relevant to answering the user's question? Consider any log entries, error messages, timestamps, or system information that could help address the query.",
+  "hallucination_system": "You are a fact-checking specialist for log analysis responses. Your task is to verify if an AI-generated answer is fully supported by the provided log documents.\n\nVERIFICATION PROCESS:\n1. Check if all specific claims (error messages, timestamps, file names) appear in the source documents\n2. Verify that interpretations and conclusions are logically derived from the log data\n3. Ensure no external knowledge or assumptions are added beyond what's in the logs\n4. Flag any statements that cannot be directly traced to the provided documents\n\nRESPONSE FORMAT: Respond with ONLY a JSON object containing 'binary_score' with value 'yes' (grounded) or 'no' (contains hallucinations).\n\nEXAMPLE RESPONSES:\n{{\"binary_score\": \"yes\"}}\n{{\"binary_score\": \"no\"}}",
+  "hallucination_human": "SOURCE DOCUMENTS:\n{documents}\n\nAI GENERATION TO VERIFY:\n{generation}\n\nIs the AI generation fully grounded in and supported by the source documents? Check for any added information, assumptions, or claims not present in the logs.",
+  "answer_system": "You are a response quality evaluator for log analysis tasks. Your job is to determine if an AI-generated answer adequately addresses the user's question about log analysis.\n\nEVALUATION CRITERIA:\n1. Answer directly addresses the specific question asked\n2. Provides relevant log analysis information (errors, patterns, recommendations)\n3. Includes specific details from the logs when available\n4. Offers actionable insights for debugging or investigation\n5. Acknowledges limitations if insufficient log data is available\n\nRESPONSE FORMAT: Respond with ONLY a JSON object containing 'binary_score' with value 'yes' (addresses question) or 'no' (does not address question).\n\nEXAMPLE RESPONSES:\n{{\"binary_score\": \"yes\"}}\n{{\"binary_score\": \"no\"}}",
+  "answer_human": "USER QUESTION:\n{question}\n\nAI GENERATED ANSWER:\n{generation}\n\nDoes the AI answer adequately address the user's question about log analysis? Consider completeness, relevance, and actionability of the response."
 }
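The rewritten grader prompts now ask for a bare JSON object such as {"binary_score": "yes"}, while the updated graphedges.py simply checks whether "yes" appears in the raw response text. A plausible sketch of how such a grader chain could be assembled with ChatNVIDIA is below; the chain wiring and the use of StrOutputParser are assumptions, not code from this commit.

import json
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_nvidia_ai_endpoints import ChatNVIDIA

with open("prompt.json") as f:
    prompts = json.load(f)

llm = ChatNVIDIA(model="nvidia/llama-3.3-nemotron-super-49b-v1.5", temperature=0)
answer_grader = (
    ChatPromptTemplate.from_messages(
        [("system", prompts["answer_system"]), ("human", prompts["answer_human"])]
    )
    | llm
    | StrOutputParser()   # return raw text; graphedges.py only looks for "yes" in it
)

raw = answer_grader.invoke({"question": "Why did the build fail?", "generation": "..."})
is_useful = "yes" in raw.lower()   # tolerant of slightly malformed JSON output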
-2.57 KB: Binary file not shown.

community/log_analysis_multi_agent_rag/utils.py

Lines changed: 1 addition & 1 deletion
@@ -67,6 +67,6 @@ def format_docs(self, docs):
 
 # Access the API key from environment variables
 api_key = os.getenv('API_KEY')
-model = "meta/llama-3.1-70b-instruct"
+model = "nvidia/llama-3.3-nemotron-super-49b-v1.5"
 prompts_file = "prompt.json"
 automation = Nodeoutputs(api_key, model, prompts_file)
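For reference, the module-level model string presumably ends up configuring a ChatNVIDIA client inside Nodeoutputs; a minimal sketch under that assumption follows (the constructor body is not shown in this diff, so the wiring below is illustrative only).

import os
from langchain_nvidia_ai_endpoints import ChatNVIDIA

class Nodeoutputs:
    def __init__(self, api_key, model, prompts_file):
        # Assumed wiring: the model name selects the Nemotron NIM endpoint;
        # the key can also be supplied via the NVIDIA_API_KEY environment variable.
        self.llm = ChatNVIDIA(model=model, nvidia_api_key=api_key, temperature=0)
        self.prompts_file = prompts_file

automation = Nodeoutputs(os.getenv("API_KEY"), "nvidia/llama-3.3-nemotron-super-49b-v1.5", "prompt.json")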
