diff options
Diffstat (limited to 'helpers/df_helpers.py')
-rw-r--r-- | helpers/df_helpers.py | 71 |
1 files changed, 71 insertions, 0 deletions
"""Helpers that move document chunks, concepts and graph edges in and out of DataFrames.

Module path: ``helpers/df_helpers.py``.
"""
import uuid
from itertools import chain

import numpy as np
import pandas as pd


def documents2Dataframe(documents) -> pd.DataFrame:
    """Build one DataFrame row per document chunk.

    Args:
        documents: Iterable of objects exposing ``page_content`` and a
            ``metadata`` mapping (presumably LangChain ``Document`` chunks;
            confirm against the caller).

    Returns:
        A DataFrame with a ``text`` column, one column per metadata key and a
        freshly generated ``chunk_id`` (uuid4 hex). A metadata key named
        ``text`` overrides the page content; ``chunk_id`` always wins over
        metadata.
    """
    rows = [
        {
            "text": chunk.page_content,
            **chunk.metadata,
            "chunk_id": uuid.uuid4().hex,
        }
        for chunk in documents
    ]
    return pd.DataFrame(rows)


def _flatten_row_results(dataframe: pd.DataFrame, row_fn) -> list:
    """Apply ``row_fn`` to every row and flatten the per-row lists into one list."""
    results = dataframe.apply(row_fn, axis=1)
    # Rows whose result is missing (None/NaN, e.g. invalid JSON upstream) are skipped.
    results = results.dropna()
    # chain.from_iterable copes with zero surviving rows; np.concatenate
    # raises ValueError when handed an empty sequence.
    return list(chain.from_iterable(results))


def df2ConceptsList(dataframe: pd.DataFrame) -> list:
    """Extract concepts from every chunk and return them as one flat list.

    Args:
        dataframe: Frame with ``text`` and ``chunk_id`` columns.

    Returns:
        The concatenation of every per-row result of ``extractConcepts``;
        rows whose result is missing are ignored. An empty frame yields ``[]``.
    """
    if dataframe.empty:
        return []

    # Imported lazily so the DataFrame-only helpers in this module stay usable
    # without loading the prompt module.
    from .prompts import extractConcepts

    return _flatten_row_results(
        dataframe,
        lambda row: extractConcepts(
            row.text, {"chunk_id": row.chunk_id, "type": "concept"}
        ),
    )


def _normalise_records(records, key_columns: list) -> pd.DataFrame:
    """Turn records into a DataFrame with usable, lower-cased key columns."""
    # Cells that are exactly a single space are treated as missing.
    frame = pd.DataFrame(records).replace(" ", np.nan)
    if frame.empty:
        # No usable records: still expose the key columns so callers get an
        # empty frame with the expected schema instead of a KeyError.
        missing = [col for col in key_columns if col not in frame.columns]
        frame = frame.reindex(columns=[*frame.columns, *missing])
    frame = frame.dropna(subset=key_columns)
    # str() first so non-string values (e.g. a numeric node name) do not crash.
    lowered = {col: frame[col].astype(str).str.lower() for col in key_columns}
    return frame.assign(**lowered)


def concepts2Df(concepts_list) -> pd.DataFrame:
    """Convert a list of concept records into a cleaned DataFrame.

    Args:
        concepts_list: Records (dict-like) that carry an ``entity`` field.

    Returns:
        A DataFrame without rows whose ``entity`` is missing or a single
        space, with ``entity`` lower-cased. The original index labels of the
        surviving rows are kept.

    Raises:
        KeyError: If non-empty input has no ``entity`` column.
    """
    return _normalise_records(concepts_list, ["entity"])


def df2Graph(dataframe: pd.DataFrame, model=None) -> list:
    """Run the graph prompt on every chunk and return all edges as one flat list.

    Args:
        dataframe: Frame with ``text`` and ``chunk_id`` columns.
        model: Passed through unchanged to ``graphPrompt``.

    Returns:
        The concatenation of every per-row result of ``graphPrompt``; rows
        whose result is missing are ignored. An empty frame yields ``[]``.
    """
    if dataframe.empty:
        return []

    # Imported lazily so the DataFrame-only helpers in this module stay usable
    # without loading the prompt module.
    from .prompts import graphPrompt

    return _flatten_row_results(
        dataframe,
        lambda row: graphPrompt(row.text, {"chunk_id": row.chunk_id}, model),
    )


def graph2Df(nodes_list) -> pd.DataFrame:
    """Convert a list of edge records into a cleaned DataFrame.

    Args:
        nodes_list: Records (dict-like) that carry ``node_1`` and ``node_2``.

    Returns:
        A DataFrame without rows where either node is missing or a single
        space, with both node columns lower-cased. The original index labels
        of the surviving rows are kept.

    Raises:
        KeyError: If non-empty input lacks ``node_1`` or ``node_2``.
    """
    return _normalise_records(nodes_list, ["node_1", "node_2"])