Skip to content

Commit fee5905

Browse files
authored
feat: Add ExcelToolkit (camel-ai#1739)
1 parent 5e21fe7 commit fee5905

File tree

12 files changed

+1745
-110
lines changed

12 files changed

+1745
-110
lines changed

.github/workflows/documentation.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ jobs:
2626

2727
- name: Sphinx build
2828
run: |
29+
source .venv/bin/activate
2930
cd docs
3031
make html
3132

camel/toolkits/__init__.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,13 +43,15 @@
4343
from .notion_toolkit import NotionToolkit
4444
from .human_toolkit import HumanToolkit
4545
from .stripe_toolkit import StripeToolkit
46-
from .video_toolkit import VideoDownloaderToolkit
46+
from .video_download_toolkit import VideoDownloaderToolkit
4747
from .dappier_toolkit import DappierToolkit
4848
from .networkx_toolkit import NetworkXToolkit
4949
from .semantic_scholar_toolkit import SemanticScholarToolkit
5050
from .zapier_toolkit import ZapierToolkit
5151
from .sympy_toolkit import SymPyToolkit
5252
from .mineru_toolkit import MinerUToolkit
53+
from .excel_toolkit import ExcelToolkit
54+
from .video_analysis_toolkit import VideoAnalysisToolkit
5355
from .image_analysis_toolkit import ImageAnalysisToolkit
5456

5557

@@ -89,5 +91,7 @@
8991
'ZapierToolkit',
9092
'SymPyToolkit',
9193
'MinerUToolkit',
94+
'ExcelToolkit',
95+
'VideoAnalysisToolkit',
9296
'ImageAnalysisToolkit',
9397
]

camel/toolkits/excel_toolkit.py

Lines changed: 172 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,172 @@
1+
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
2+
# Licensed under the Apache License, Version 2.0 (the "License");
3+
# you may not use this file except in compliance with the License.
4+
# You may obtain a copy of the License at
5+
#
6+
# http://www.apache.org/licenses/LICENSE-2.0
7+
#
8+
# Unless required by applicable law or agreed to in writing, software
9+
# distributed under the License is distributed on an "AS IS" BASIS,
10+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11+
# See the License for the specific language governing permissions and
12+
# limitations under the License.
13+
# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. =========
14+
15+
from typing import List
16+
17+
import pandas as pd
18+
19+
from camel.logger import get_logger
20+
from camel.toolkits.base import BaseToolkit
21+
from camel.toolkits.function_tool import FunctionTool
22+
23+
logger = get_logger(__name__)
24+
25+
26+
class ExcelToolkit(BaseToolkit):
    r"""A toolkit for extracting detailed cell information from spreadsheet
    files.

    Supported inputs are ``xls``, ``xlsx`` and ``csv`` files. For Excel
    workbooks, every sheet is reported with the value, font color and fill
    color of each cell, together with a Markdown rendering of the sheet.
    CSV files are rendered as a single Markdown table.
    """

    def _convert_to_markdown(self, df: pd.DataFrame) -> str:
        r"""Convert DataFrame to Markdown format table.

        Args:
            df (pd.DataFrame): DataFrame containing the Excel data.

        Returns:
            str: Markdown formatted table.
        """
        from tabulate import tabulate

        md_table = tabulate(df, headers='keys', tablefmt='pipe')
        return str(md_table)

    def extract_excel_content(self, document_path: str) -> str:
        r"""Extract detailed cell information from an Excel file, including
        multiple sheets.

        A ``csv`` file is read with pandas and returned as one Markdown
        table. An ``xls`` file is first converted to an ``xlsx`` file written
        next to the original, and that converted file is then processed.

        Args:
            document_path (str): The path of the Excel file. The extension
                is matched case-insensitively against ``.xls``, ``.xlsx``
                and ``.csv``.

        Returns:
            str: Extracted excel information, including details of each
                sheet. For an unsupported extension, or when reading a CSV
                file fails, a message starting with
                ``Failed to process file`` is returned instead.

        Note:
            Errors raised while converting or loading an Excel workbook are
            not caught here and propagate to the caller.
        """
        import os

        from openpyxl import load_workbook
        from openpyxl.utils import get_column_letter
        from xls2xlsx import XLS2XLSX

        logger.debug(
            f"Calling extract_excel_content with document_path"
            f": {document_path}"
        )

        # Decide on the real file extension rather than a bare suffix, so
        # that "REPORT.XLSX" is accepted and "notes_xls" is not.
        path_root, extension = os.path.splitext(document_path)
        extension = extension.lower()

        if extension not in (".xls", ".xlsx", ".csv"):
            logger.error("Only xls, xlsx, csv files are supported.")
            return (
                f"Failed to process file {document_path}: "
                "It is not excel format. Please try other ways."
            )

        if extension == ".csv":
            try:
                df = pd.read_csv(document_path)
                md_table = self._convert_to_markdown(df)
                return f"CSV File Processed:\n{md_table}"
            except Exception as e:
                logger.error(f"Failed to process file {document_path}: {e}")
                return f"Failed to process file {document_path}: {e}"

        if extension == ".xls":
            # Only the trailing extension is swapped; a ".xls" appearing
            # elsewhere in the path (e.g. in a directory name) is untouched.
            output_path = f"{path_root}.xlsx"
            x2x = XLS2XLSX(document_path)
            x2x.to_xlsx(output_path)
            document_path = output_path

        # Load the Excel workbook
        wb = load_workbook(document_path, data_only=True)
        sheet_info_list = []

        # Iterate through all sheets
        for sheet in wb.sheetnames:
            ws = wb[sheet]
            cell_info_list = []

            for row in ws.iter_rows():
                for cell in row:
                    row_num = cell.row
                    # Derive the letter from the column number: merged
                    # cells do not expose a `column_letter` attribute.
                    col_letter = get_column_letter(cell.column)

                    cell_value = cell.value

                    font_color = None
                    if (
                        cell.font
                        and cell.font.color
                        and "rgb=None" not in str(cell.font.color)
                    ):  # Handle font color
                        font_color = cell.font.color.rgb

                    fill_color = None
                    if (
                        cell.fill
                        and cell.fill.fgColor
                        and "rgb=None" not in str(cell.fill.fgColor)
                    ):  # Handle fill color
                        fill_color = cell.fill.fgColor.rgb

                    cell_info_list.append(
                        {
                            # Row number first, then column letter ("1A").
                            "index": f"{row_num}{col_letter}",
                            "value": cell_value,
                            "font_color": font_color,
                            "fill_color": fill_color,
                        }
                    )

            # Convert the sheet to a DataFrame and then to markdown
            sheet_df = pd.read_excel(
                document_path, sheet_name=sheet, engine='openpyxl'
            )
            markdown_content = self._convert_to_markdown(sheet_df)

            # Collect all information for the sheet
            sheet_info = {
                "sheet_name": sheet,
                "cell_info_list": cell_info_list,
                "markdown_content": markdown_content,
            }
            sheet_info_list.append(sheet_info)

        # Render one section per sheet and join them in a single pass.
        result_str = "".join(
            f"""
            Sheet Name: {sheet_info['sheet_name']}
            Cell information list:
            {sheet_info['cell_info_list']}

            Markdown View of the content:
            {sheet_info['markdown_content']}

            {'-'*40}
            """
            for sheet_info in sheet_info_list
        )

        return result_str

    def get_tools(self) -> List[FunctionTool]:
        r"""Returns a list of FunctionTool objects representing the functions
        in the toolkit.

        Returns:
            List[FunctionTool]: A list of FunctionTool objects representing
                the functions in the toolkit.
        """
        return [
            FunctionTool(self.extract_excel_content),
        ]

0 commit comments

Comments
 (0)