RAG Research Assistant

Project Goal:

This project augments an LLM to help with learning the salient points of research papers. Reading research papers is time-intensive, so a chatbot that can summarize a paper and answer questions about it reduces the time needed to keep up with AI research developments.

Approach:

I used a RAG pipeline to feed context from research papers to GPT-4o.

Example Results:

Papers Uploaded: "Retrieval Augmented Generation or Long-Context LLMs? A Comprehensive Study and Hybrid Approach" (Li et al., 2024) and "Searching for Best Practices in Retrieval-Augmented Generation" (Wang et al., 2024)

Highlighted Code Sections:


import os

from langchain import hub
from langchain_chroma import Chroma
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnableLambda, RunnableMap, RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Load the papers from the PDF folder
pdf_folder_path = "papers"  # illustrative path; point this at your PDF directory
docs = []
for file in os.listdir(pdf_folder_path):
    if file.endswith('.pdf'):
        pdf_path = os.path.join(pdf_folder_path, file)
        loader = PyPDFLoader(pdf_path)
        loaded_docs = loader.load()
        # Tag each page with its source filename so answers can cite it
        for doc in loaded_docs:
            doc.metadata["source"] = file
        docs.extend(loaded_docs)

# Split the papers into overlapping chunks for embedding
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)

# Embed the chunks and index them in a Chroma vector store
vectorstore = Chroma.from_documents(documents=splits,
                                    embedding=OpenAIEmbeddings())
retriever = vectorstore.as_retriever()

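Before wiring up the full chain, the retriever can be sanity-checked on its own; the query string below is illustrative:

# Illustrative check that retrieval returns relevant chunks
hits = retriever.invoke("What chunk sizes do the papers recommend?")
print(len(hits), hits[0].metadata.get("source"))
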
# Retrieval and generation
prompt = hub.pull("rlm/rag-prompt")
llm = ChatOpenAI(model="gpt-4o", temperature=0)

# Post-processing: join the retrieved chunks into one context string and
# keep the documents themselves so their sources can be cited later
def format_docs_with_metadata(docs):
    return {
        "context": "\n\n".join(doc.page_content for doc in docs),
        "source_docs": docs
    }

# Build the chain so the source documents flow through with the answer
rag_chain = (
    # Retrieve documents for the question; pass the question through unchanged
    RunnableMap({
        "context": retriever | RunnableLambda(format_docs_with_metadata),
        "question": RunnablePassthrough()
    })
    # Flatten the nested context dict into top-level keys
    | RunnableLambda(lambda inputs: {
        "question": inputs["question"],
        "context": inputs["context"]["context"],
        "source_docs": inputs["context"]["source_docs"]
    })
    # Generate the answer while carrying the source documents along
    | RunnableMap({
        "response": prompt | llm | StrOutputParser(),
        "source_docs": lambda x: x["source_docs"]
    })
)

# Format the result so the sources print at the end of the response
def format_response(result):
    response = result["response"]
    sources = {doc.metadata.get("source", "Unknown") for doc in result["source_docs"]}
    return f"{response}\n\nSources: {', '.join(sources)}"
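
To tie the pieces together, here is a minimal end-to-end usage sketch; the question string is illustrative:

# Illustrative query against the chain defined above
question = "When does long context outperform RAG, according to Li et al.?"
result = rag_chain.invoke(question)
print(format_response(result))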