0% found this document useful (0 votes)
112 views

Streamlit PDF Application Setup All Commands in One Single File

Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT, PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
112 views

Streamlit PDF Application Setup All Commands in One Single File

Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT, PDF, TXT or read online on Scribd
You are on page 1/ 8

=============================================================================
COMPLETE STEP BY STEP GUIDE FOR PREPARING STREAMLIT APPLICATION FOR PDF FILES
=============================================================================

Step 1:
~~~~~~~

sqlplus sys/WElcome__123@VECTORDB as sysdba


-- Step 1 (run as SYS): recreate the BASKAR demo user, grant the privileges the
-- later steps rely on, and expose the PDF and model directories to that user.

-- The VECTORTBS tablespace is expected to exist already. The statement that
-- creates it is kept for reference; BOTH lines must stay commented out,
-- otherwise the second line is sent to the database as a statement of its own.
-- CREATE TABLESPACE vectortbs DATAFILE 'vectordf01.dbf' SIZE 2G AUTOEXTEND ON
--   EXTENT MANAGEMENT LOCAL SEGMENT SPACE MANAGEMENT AUTO;

-- IF EXISTS lets the very first run succeed when the user has not been created yet.
drop user if exists baskar cascade;

create user baskar identified by WElcome__123
  default tablespace vectortbs
  quota unlimited on vectortbs;

-- NOTE(review): DBA and SYSDBA are far broader than anything the later steps
-- appear to need; confirm whether they can be removed outside a sandbox.
grant connect, ctxapp, unlimited tablespace, create credential, create procedure,
  create table, dba, sysdba to baskar;
grant create mining model, DB_DEVELOPER_ROLE to baskar;
grant execute on sys.dmutil_lib to baskar;
grant execute on DBMS_NETWORK_ACL_ADMIN to baskar;
grant execute on DBMS_VECTOR to baskar;
grant execute on DBMS_VECTOR_CHAIN to baskar;

-- PDF_DIR holds the PDF files to load; MODEL_DIR holds the ONNX embedding model.
create or replace directory PDF_DIR as '/home/oracle/pdf_files';
create or replace directory MODEL_DIR as '/home/oracle/';
grant read, write on directory PDF_DIR to baskar;
grant read, write on directory MODEL_DIR to baskar;

-- No COMMIT is needed: every statement above is DDL, which Oracle commits implicitly.
exit;

Step 2:
~~~~~~~

sqlplus baskar/WElcome__123@VECTORDB

-- Step 2: (re)load the ONNX embedding model under the name DOC_MODEL.
-- Written as anonymous PL/SQL blocks rather than SQL*Plus EXEC, because EXEC
-- must fit on one line and the load call does not.

-- force => true makes the drop succeed even when the model does not exist yet.
BEGIN
  dbms_vector.drop_onnx_model(model_name => 'doc_model', force => true);
END;
/

-- Load my_embedding_model.onnx from MODEL_DIR and register it as an embedding
-- model whose input tensor "input" is fed from the DATA attribute.
BEGIN
  dbms_vector.load_onnx_model(
    'MODEL_DIR',
    'my_embedding_model.onnx',
    'doc_model',
    JSON('{"function" : "embedding", "embeddingOutput" : "embedding" , "input":{"input": ["DATA"]}}'));
END;
/

Step 3:
~~~~~~~

-- Step 3: allow the BASKAR database user to open outbound network connections
-- to any host ('*') by appending a "connect" access control entry.
DECLARE
  l_connect_ace xs$ace_type :=
    xs$ace_type(
      privilege_list => xs$name_list('connect'),
      principal_name => 'baskar',
      principal_type => xs_acl.ptype_db);
BEGIN
  DBMS_NETWORK_ACL_ADMIN.APPEND_HOST_ACE(
    host => '*',
    ace  => l_connect_ace);
END;
/
Step 4:
~~~~~~~

-- Step 4a: best-effort removal of the OCI GenAI credential so that the create
-- step can be re-run. The name must match the credential created in this guide
-- (OCI_GENAI_BASKAR_CRED); dropping any other name leaves the old credential in
-- place and makes the subsequent create fail.
BEGIN
  DBMS_VECTOR.DROP_CREDENTIAL('OCI_GENAI_BASKAR_CRED');
EXCEPTION
  WHEN OTHERS THEN
    -- Deliberately non-fatal (for example on the first run, when nothing exists
    -- to drop), but report the reason instead of hiding it completely.
    DBMS_OUTPUT.PUT_LINE('Credential not dropped: ' || SQLERRM);
END;
/

-- Step 4b: create the OCI_GENAI_BASKAR_CRED credential from the OCI identity
-- details. Fill in the five empty values below before running.
DECLARE
  jo json_object_t;
BEGIN
  jo := json_object_t();
  jo.put('user_ocid', '');
  jo.put('tenancy_ocid', '');
  jo.put('compartment_ocid', '');
  jo.put('private_key', '');
  jo.put('fingerprint', '');
  -- The assembled JSON is intentionally NOT echoed with DBMS_OUTPUT: it contains
  -- the private key and would end up in terminal scrollback and spool files.
  dbms_vector.create_credential(
    credential_name => 'OCI_GENAI_BASKAR_CRED',
    params          => json(jo.to_string));
END;
/

Step 5:
~~~~~~~

Create .env file in /home/oracle folder

cd /home/oracle/
vi .env

COMPARTMENT_OCID=ocid1.compartment.oc1..aaaaaaaa4e4qshiempprixk6pvgdk26txt5dy4dhtipiwjjkmrre4wozi6lq
[DATABASE]
username=baskar
password=WElcome__123
dsn=localhost:1521/freepdb1

Step 6:
~~~~~~~

# Prepare Tables for Storing PDF Files and Vectorization

sqlplus baskar/WElcome__123@VECTORDB

-- Step 6: (re)create the two application tables.
--   MY_BOOKS     : one row per uploaded PDF, with the file stored as a BLOB.
--   VECTOR_STORE : one row per text chunk of a PDF, with its embedding vector.
-- IF EXISTS keeps the drops from failing on the first run. The child table is
-- dropped first. No statement contains a blank line, because SQL*Plus ends
-- statement input at a blank line by default.
DROP TABLE IF EXISTS vector_store CASCADE CONSTRAINTS PURGE;
DROP TABLE IF EXISTS my_books CASCADE CONSTRAINTS PURGE;

CREATE TABLE IF NOT EXISTS "MY_BOOKS"
(
  ID INTEGER GENERATED BY DEFAULT ON NULL AS IDENTITY ( START WITH 1 CACHE 20 )
    CONSTRAINT my_books_pk PRIMARY KEY,
  file_name VARCHAR2 (900) ,
  file_size INTEGER ,
  file_type VARCHAR2 (100) ,
  file_content BLOB )
LOGGING;

CREATE TABLE IF NOT EXISTS "VECTOR_STORE"
( "DOC_ID" NUMBER(*,0) NOT NULL ENABLE,
  "EMBED_ID" NUMBER,
  "EMBED_DATA" VARCHAR2(4000 BYTE),
  "EMBED_VECTOR" VECTOR,
  -- Each chunk belongs to exactly one uploaded document.
  CONSTRAINT vector_store_doc_id_fk FOREIGN KEY (DOC_ID) REFERENCES MY_BOOKS(ID)
);

Step 7:
~~~~~~

# Insert one sample file.

-- Step 7: load one sample PDF from the server file system into MY_BOOKS.
-- The file is read through the PDF_DIR directory object created in Step 1
-- (/home/oracle/pdf_files), so Oracle_Analytics_Server.pdf must be placed there.
-- The statement is kept free of blank lines, which SQL*Plus would treat as the
-- end of the statement.
insert into my_books (file_name, file_size, file_type, file_content)
values (
  'Oracle_Analytics_Server.pdf',
  dbms_lob.getlength(to_blob(bfilename('PDF_DIR', 'Oracle_Analytics_Server.pdf'))),
  'PDF',
  to_blob(bfilename('PDF_DIR', 'Oracle_Analytics_Server.pdf'))
);
commit;

Step 8:
~~~~~~~

# Perform the Vectorization for the PDF files and store it in VECTOR_STORE table

-- Step 8: chunk and embed every PDF in MY_BOOKS and store the results in
-- VECTOR_STORE. Pipeline per document:
--   utl_to_text        -> plain text extracted from the BLOB
--   utl_to_chunks      -> normalized text chunks
--   utl_to_embeddings  -> one JSON document per chunk, embedded with DOC_MODEL
--   JSON_TABLE         -> embed_id / embed_data / embed_vector columns
INSERT INTO vector_store (doc_id, embed_id, embed_data, embed_vector)
SELECT
    dt.id,
    et.embed_id,
    et.text_chunk,
    -- Explicit conversion of the JSON text into a VECTOR, matching the trigger
    -- that vectorizes newly uploaded files.
    to_vector(et.embed_vector)
FROM my_books dt
CROSS JOIN TABLE(
    dbms_vector_chain.utl_to_embeddings(
        dbms_vector_chain.utl_to_chunks(
            dbms_vector_chain.utl_to_text(dt.file_content),
            json('{"normalize":"all"}')
        ),
        json('{"provider":"database", "model":"doc_model"}')
    )
) t
CROSS JOIN JSON_TABLE(
    t.column_value,
    '$[*]' COLUMNS (
        embed_id     NUMBER         PATH '$.embed_id',
        text_chunk   VARCHAR2(4000) PATH '$.embed_data',
        embed_vector CLOB           PATH '$.embed_vector'
    )
) AS et
-- Skip documents that already have embeddings so that re-running this step
-- does not store every chunk a second time.
WHERE NOT EXISTS (
    SELECT 1
    FROM vector_store vs
    WHERE vs.doc_id = dt.id
);
COMMIT;

Step 9:
~~~~~~~

# Do a sample Semantic Search with a hard-coded question:

-- Step 9: return the four stored chunks closest (by cosine distance) to a
-- hard-coded question.
WITH question AS (
    -- Embed the question with the same model that embedded the chunks.
    SELECT VECTOR_EMBEDDING(doc_model USING 'What is Oracle Analytics?' AS data) AS embedding
)
SELECT
    vs.embed_id,
    vs.embed_data
FROM vector_store vs
CROSS JOIN question q
ORDER BY VECTOR_DISTANCE(vs.embed_vector, q.embedding, COSINE)
FETCH FIRST 4 ROWS ONLY;

Step 10:
~~~~~~~~

# Create a Function to accept the User Question and perform the Similarity Search
# with the existing PDF files.
# This function will be called in the Streamlit Python program.

-- SQL*Plus session settings applied before the function below is created.
-- NOTE(review): UNDEFINE is given no variable name here - confirm what it was
-- meant to clear.
UNDEFINE
-- Show DBMS_OUTPUT text produced by PL/SQL in this session.
SET SERVEROUTPUT ON;
-- Echo each command and report row counts, so the run is easy to follow.
SET ECHO ON
SET FEEDBACK 1
-- Display formatting: numeric column width, line width, spool trimming, tabs.
SET NUMWIDTH 10
SET LINESIZE 80
SET TRIMSPOOL ON
SET TAB OFF
-- Large page size and LONG display limit, presumably so the CLOB response of
-- the test query is not cut short - confirm the limit is sufficient.
SET PAGESIZE 10000
SET LONG 10000
create or replace FUNCTION generate_text_response_gen(
    user_question IN VARCHAR2,
    docid         NUMBER
) RETURN CLOB IS
  -- Answers a question about one stored PDF.
  --
  -- Parameters:
  --   user_question : the question text entered by the user.
  --   docid         : MY_BOOKS.ID of the document to search (VECTOR_STORE.DOC_ID).
  -- Returns:
  --   The text produced by DBMS_VECTOR_CHAIN.UTL_TO_SUMMARY for a prompt built
  --   from the question and the four closest chunks of that document.
  user_question_vec VECTOR;
  oci_genai_params  CLOB;
  context           CLOB;
  prompt            CLOB;
  response          CLOB;
BEGIN
  -- Embed the question with the same model that embedded the stored chunks.
  select to_vector(vector_embedding(doc_model USING user_question as data)) as embedding
    into user_question_vec
    from dual;

  -- Collect the four chunks of this document nearest to the question.
  context := '';
  FOR rec IN (SELECT embed_data
                FROM VECTOR_STORE
               where doc_id = docid
               order by vector_distance(embed_vector, user_question_vec, COSINE)
               FETCH FIRST 4 ROWS ONLY)
  LOOP
    -- A newline between chunks keeps the last word of one chunk from fusing
    -- with the first word of the next; the trailing newline is removed by the
    -- RTRIM below.
    context := context || rec.embed_data || CHR(10);
  END LOOP;

  prompt := 'Answer the following question using the supplied context: '
            || user_question || ' Context: ' || context;
  prompt := RTRIM(prompt, ',' || CHR(10));

  -- The parameter document is assembled from single-line pieces: a line break
  -- inside a JSON string value (such as the URL) makes the JSON invalid.
  oci_genai_params :=
       '{"provider": "ocigenai", '
    || '"credential_name": "OCI_GENAI_BASKAR_CRED", '
    || '"url": "https://inference.generativeai.us-chicago-1.oci.oraclecloud.com/20231130/actions/summarizeText", '
    || '"model": "cohere.command"}';

  response := DBMS_VECTOR_CHAIN.UTL_TO_SUMMARY(prompt, json(oci_genai_params));
  RETURN response;
END;
/

Test the Response Output by passing the question and pdf docid to this Function:

-- Smoke test: ask a question about document 1. Kept free of blank lines, which
-- SQL*Plus would otherwise treat as the end of the statement.
select generate_text_response_gen('What is Oracle Analytics', 1) as Response
from dual;

Step 11:
~~~~~~~~

# Create a Procedure that will accept the new PDF file from the User and store it in
# the MY_BOOKS table.
# This procedure will be called in the Streamlit Python program.

create or replace PROCEDURE insert_my_table_row(
    p_file_name    IN  my_books.file_name%TYPE,
    p_file_size    IN  my_books.file_size%TYPE,
    p_file_type    IN  my_books.file_type%TYPE,
    p_file_content IN  my_books.file_content%TYPE,
    p_new_id       OUT number
) IS
  -- Stores an uploaded file in MY_BOOKS unless a file with the same name and
  -- size is already present, and reports the row's ID either way.
  --
  -- Parameters:
  --   p_file_name, p_file_size, p_file_type, p_file_content : the uploaded file.
  --   p_new_id : ID of the existing or newly inserted MY_BOOKS row.
  -- Errors:
  --   Any failure is rolled back, printed via DBMS_OUTPUT and re-raised, so the
  --   caller sees the real database error instead of a NULL p_new_id.
  new_id number;
BEGIN
  -- MIN() returns a single row even when no file or several files match, so
  -- neither NO_DATA_FOUND nor TOO_MANY_ROWS can be raised here: NULL means
  -- "not stored yet", otherwise the earliest matching row is reused.
  SELECT MIN(id)
    INTO new_id
    FROM MY_BOOKS
   WHERE file_name = p_file_name
     AND file_size = p_file_size;

  IF new_id IS NULL THEN
    INSERT INTO MY_BOOKS (file_name, file_size, file_type, file_content)
    VALUES (p_file_name, p_file_size, p_file_type, p_file_content)
    RETURNING id INTO new_id;
  END IF;

  p_new_id := new_id;
  dbms_output.put_line(new_id);
  COMMIT;
EXCEPTION
  WHEN OTHERS THEN
    -- Undo the partial work, log the message, then let the caller see the error.
    ROLLBACK;
    DBMS_OUTPUT.PUT_LINE('Error: ' || SQLERRM);
    RAISE;
END insert_my_table_row;
/

Step 12:
~~~~~~~~

# Create a Trigger which will do the Vectorization for the newly uploaded PDF file
# and store the vector embeddings in the VECTOR_STORE table.
# This trigger fires automatically whenever the Streamlit Python program inserts a row into MY_BOOKS.

CREATE OR REPLACE TRIGGER trg_mybooks_vector_store_compound
FOR INSERT ON my_books
COMPOUND TRIGGER

  -- IDs of the MY_BOOKS rows inserted by the triggering statement.
  TYPE t_book_id_list IS TABLE OF my_books.id%TYPE INDEX BY PLS_INTEGER;
  l_new_book_ids t_book_id_list;

  -- Row level: only remember the new ID; MY_BOOKS itself is read after the
  -- statement has finished.
  AFTER EACH ROW IS
  BEGIN
    l_new_book_ids(l_new_book_ids.COUNT + 1) := :NEW.id;
  END AFTER EACH ROW;

  -- Statement level: for every remembered ID, extract the text of the stored
  -- file, split it into chunks, embed the chunks with DOC_MODEL and store one
  -- VECTOR_STORE row per chunk.
  AFTER STATEMENT IS
  BEGIN
    FOR idx IN 1 .. l_new_book_ids.COUNT LOOP
      INSERT INTO vector_store (doc_id, embed_id, embed_data, embed_vector)
        SELECT
            dt.id                      AS doc_id,
            et.embed_id,
            et.embed_data,
            to_vector(et.embed_vector) AS embed_vector
        FROM my_books dt
        CROSS JOIN TABLE(
            dbms_vector_chain.utl_to_embeddings(
                dbms_vector_chain.utl_to_chunks(
                    dbms_vector_chain.utl_to_text(dt.file_content),
                    json('{"normalize":"all"}')
                ),
                json('{"provider":"database", "model":"doc_model"}')
            )
        ) t
        CROSS JOIN JSON_TABLE(
            t.column_value,
            '$[*]' COLUMNS (
                embed_id     NUMBER         PATH '$.embed_id',
                embed_data   VARCHAR2(4000) PATH '$.embed_data',
                embed_vector CLOB           PATH '$.embed_vector'
            )
        ) AS et
        WHERE dt.id = l_new_book_ids(idx);
    END LOOP;
  END AFTER STATEMENT;

END trg_mybooks_vector_store_compound;
/

Step 13:
~~~~~~~~~

# Create a file called "pdf_vector_application.py" in the /home/oracle location and
# copy the Python program code below into it.

cd /home/oracle
vi pdf_vector_application.py

import oracledb
from dotenv import load_dotenv
import os
import requests
import time
from PyPDF2 import PdfReader
import streamlit as st
def call_insert_my_table_row(conn23c, pdf_name, pdf_size, pdf_type, pdf_content):
    """Store an uploaded PDF through the ``insert_my_table_row`` procedure.

    Args:
        conn23c: Open database connection.
        pdf_name: File name of the upload.
        pdf_size: File size in bytes.
        pdf_type: File type stored alongside the file.
        pdf_content: Raw bytes of the file.

    Returns:
        The value of the procedure's OUT parameter (the MY_BOOKS row ID), or
        ``None`` when the database reports an error.
    """
    try:
        # The context manager closes the cursor on every path. It also avoids
        # touching an unassigned ``cursor`` name in a ``finally`` clause when
        # creating the cursor is the step that fails.
        with conn23c.cursor() as cursor:
            new_id = cursor.var(oracledb.NUMBER)
            cursor.callproc(
                "insert_my_table_row",
                [pdf_name, pdf_size, pdf_type, pdf_content, new_id],
            )
            conn23c.commit()
            print("Procedure executed successfully!" + str(new_id.getvalue()))
            return new_id.getvalue()
    except oracledb.DatabaseError as e:
        error, = e.args
        print("Error:", error.message)
        return None

def get_answers(conn23c, user_question, new_id):
    """Ask the database to answer ``user_question`` about document ``new_id``.

    Calls the ``generate_text_response_gen`` stored function with the question
    and the document ID, requesting a CLOB result, and returns whatever the
    call yields.
    """
    return conn23c.cursor().callfunc(
        "generate_text_response_gen",
        oracledb.CLOB,
        [user_question, new_id],
    )

def main():
    """Streamlit page: upload a PDF, store it in the database, answer questions.

    Reads ``username``, ``password`` and ``dsn`` from the environment (loaded
    from ``.env``), connects to the database, stores the uploaded PDF through
    ``call_insert_my_table_row`` and answers the user's question through
    ``get_answers``.
    """
    load_dotenv()
    st.set_page_config(page_title="Ask Question Based on PDF")
    st.info("Oracle AI Vector Search with OCI GenAI LLM")
    st.header("Ask your question to get answers based on your pdf")

    username = os.getenv("username")
    password = os.getenv("password")
    dsn = os.getenv("dsn")

    try:
        conn23c = oracledb.connect(user=username, password=password, dsn=dsn)
    except Exception as e:
        # Without a connection nothing below can work; stop here instead of
        # failing later with a NameError on ``conn23c``.
        print("Connection failed!", e)
        st.error("Could not connect to the database. Check the settings in .env.")
        return
    print("Connection successful!")

    try:
        pdf = st.file_uploader("Upload your PDF", type="pdf")
        if pdf is None:
            return

        start_time = time.time()
        # Parsing is kept from the original flow (it raises on a file PyPDF2
        # cannot read); the parsed result itself is not used.
        PdfReader(pdf)
        file_name = pdf.name
        file_size = pdf.size
        file_type = 'application/pdf'
        file_content = pdf.getvalue()

        # Call procedure to insert PDF into database and get new id
        new_id = call_insert_my_table_row(conn23c, file_name, file_size, file_type,
                                          file_content)
        print(" the new id is " + str(new_id))

        if new_id is None:
            # No document ID means there is nothing to ask questions about.
            st.error('Failed to insert the file into the database.')
            return
        st.success('File uploaded successfully and inserted into the database.')

        user_question = st.text_input("Ask a question about your PDF")
        if user_question:
            answer = get_answers(conn23c, user_question, new_id)
            st.markdown(answer)
            elapsed_time = time.time() - start_time
            st.caption(f"Total processing time: {round(elapsed_time, 1)} sec.")
    finally:
        # Streamlit re-runs this script on every interaction, so a connection
        # left open here would accumulate with each rerun.
        conn23c.close()

# Script entry point: build the page when the file is executed
# (for example via `streamlit run pdf_vector_application.py`).
if __name__ == '__main__':
    main()

Step 14:
~~~~~~~~~
# Open VNC Viewer and login to Oracle OS user.
# Invoke the Python Program with streamlit command.

cd /home/oracle
streamlit run pdf_vector_application.py

Once Streamlit is running, open the Mozilla browser inside the VNC Viewer.
Then, in the browser, go to:
localhost:8501/

Note:
The "Oracle_Analytics_Server.pdf" file has already been uploaded as Doc ID 1, so
upload different documents.

Step 15:
~~~~~~~~
# Cleanup the schema and files

drop user baskar cascade;


rm /home/oracle/pdf_vector_application.py
rm /home/oracle/my_embedding_model.onnx

You might also like