|
| 1 | +# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. ========= |
| 2 | +# Licensed under the Apache License, Version 2.0 (the "License"); |
| 3 | +# you may not use this file except in compliance with the License. |
| 4 | +# You may obtain a copy of the License at |
| 5 | +# |
| 6 | +# http://www.apache.org/licenses/LICENSE-2.0 |
| 7 | +# |
| 8 | +# Unless required by applicable law or agreed to in writing, software |
| 9 | +# distributed under the License is distributed on an "AS IS" BASIS, |
| 10 | +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 11 | +# See the License for the specific language governing permissions and |
| 12 | +# limitations under the License. |
| 13 | +# ========= Copyright 2023-2024 @ CAMEL-AI.org. All Rights Reserved. ========= |
| 14 | + |
| 15 | +from typing import List |
| 16 | + |
| 17 | +import pandas as pd |
| 18 | + |
| 19 | +from camel.logger import get_logger |
| 20 | +from camel.toolkits.base import BaseToolkit |
| 21 | +from camel.toolkits.function_tool import FunctionTool |
| 22 | + |
| 23 | +logger = get_logger(__name__) |
| 24 | + |
| 25 | + |
class ExcelToolkit(BaseToolkit):
    r"""A toolkit for extracting detailed cell information from spreadsheet
    files.

    This class provides methods for processing ``xls``, ``xlsx`` and ``csv``
    files. Every sheet of a workbook is reported both cell by cell (value,
    font color and fill color) and as a Markdown table.
    """

    def _convert_to_markdown(self, df: pd.DataFrame) -> str:
        r"""Convert DataFrame to Markdown format table.

        Args:
            df (pd.DataFrame): DataFrame containing the Excel data.

        Returns:
            str: Markdown formatted table.
        """
        from tabulate import tabulate

        md_table = tabulate(df, headers='keys', tablefmt='pipe')
        return str(md_table)

    def extract_excel_content(self, document_path: str) -> str:
        r"""Extract detailed cell information from an Excel file, including
        multiple sheets.

        Args:
            document_path (str): The path of the Excel file. Paths ending in
                ``.xls``, ``.xlsx`` or ``.csv`` (case-insensitive) are
                supported. An ``.xls`` file is first converted to an
                ``.xlsx`` file written next to it.

        Returns:
            str: Extracted excel information, including details of each
                sheet. A failure message is returned instead when the file
                extension is not supported or a CSV file cannot be read.
        """
        logger.debug(
            f"Calling extract_excel_content with document_path"
            f": {document_path}"
        )

        # Match on a lowercased copy so that e.g. "REPORT.XLSX" is accepted,
        # and require the leading dot so that "notesxls" is not.
        lowered_path = document_path.lower()
        if not lowered_path.endswith((".xls", ".xlsx", ".csv")):
            logger.error("Only xls, xlsx, csv files are supported.")
            return (
                f"Failed to process file {document_path}: "
                f"It is not excel format. Please try other ways."
            )

        if lowered_path.endswith(".csv"):
            try:
                df = pd.read_csv(document_path)
                md_table = self._convert_to_markdown(df)
                return f"CSV File Processed:\n{md_table}"
            except Exception as e:
                logger.error(f"Failed to process file {document_path}: {e}")
                return f"Failed to process file {document_path}: {e}"

        if lowered_path.endswith(".xls"):
            # Imported here so the converter is only required for xls input.
            from xls2xlsx import XLS2XLSX

            # Swap only the trailing extension; a plain str.replace would
            # also rewrite any ".xls" occurring earlier in the path.
            output_path = document_path[: -len(".xls")] + ".xlsx"
            x2x = XLS2XLSX(document_path)
            x2x.to_xlsx(output_path)
            document_path = output_path

        from openpyxl import load_workbook
        from openpyxl.utils import get_column_letter

        # Load the Excel workbook; data_only=True yields the cached values
        # of formula cells rather than the formula text.
        wb = load_workbook(document_path, data_only=True)
        sheet_info_list = []

        # Iterate through all sheets
        for sheet in wb.sheetnames:
            ws = wb[sheet]
            cell_info_list = []

            for row in ws.iter_rows():
                for cell in row:
                    # Derive the letter from the numeric column index:
                    # merged cells expose `column` but not `column_letter`.
                    col_letter = get_column_letter(cell.column)

                    font_color = None
                    if (
                        cell.font
                        and cell.font.color
                        and "rgb=None" not in str(cell.font.color)
                    ):  # Handle font color
                        font_color = cell.font.color.rgb

                    fill_color = None
                    if (
                        cell.fill
                        and cell.fill.fgColor
                        and "rgb=None" not in str(cell.fill.fgColor)
                    ):  # Handle fill color
                        fill_color = cell.fill.fgColor.rgb

                    cell_info_list.append(
                        {
                            # Row number first, then the column letter.
                            "index": f"{cell.row}{col_letter}",
                            "value": cell.value,
                            "font_color": font_color,
                            "fill_color": fill_color,
                        }
                    )

            # Convert the sheet to a DataFrame and then to markdown
            sheet_df = pd.read_excel(
                document_path, sheet_name=sheet, engine='openpyxl'
            )
            markdown_content = self._convert_to_markdown(sheet_df)

            # Collect all information for the sheet
            sheet_info_list.append(
                {
                    "sheet_name": sheet,
                    "cell_info_list": cell_info_list,
                    "markdown_content": markdown_content,
                }
            )

        # One report section per sheet, concatenated in workbook order.
        return "".join(
            f"""
            Sheet Name: {sheet_info['sheet_name']}
            Cell information list:
            {sheet_info['cell_info_list']}

            Markdown View of the content:
            {sheet_info['markdown_content']}

            {'-'*40}
            """
            for sheet_info in sheet_info_list
        )

    def get_tools(self) -> List[FunctionTool]:
        r"""Returns a list of FunctionTool objects representing the functions
        in the toolkit.

        Returns:
            List[FunctionTool]: A list of FunctionTool objects representing
                the functions in the toolkit.
        """
        return [
            FunctionTool(self.extract_excel_content),
        ]
0 commit comments