Skip to content

Commit 9fe073d

Browse files
committed
New method for automagically determining file context
The Coder class has been enriched with two new methods: `auto_file_context` and `get_model_token_limit`. The former method generates a file context based on the user's recent activity, while the latter retrieves the token limit for a given model. Additionally, the `get_llm_model_name` method has been refactored to accommodate these changes. Tests have been added to ensure the correct functionality of these new features.
1 parent 79917e2 commit 9fe073d

File tree

2 files changed

+150
-25
lines changed

2 files changed

+150
-25
lines changed

aicodebot/coder.py

Lines changed: 115 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,59 @@ class Coder:
2020

2121
UNKNOWN_FILE_TYPE = "unknown"
2222

23+
@staticmethod
def auto_file_context(max_tokens, max_file_tokens):
    """Automatically generate a file context based on what we think the user is working on.

    Candidate files are gathered from recent commits plus staged/unstaged
    changes, scored by recency (mtime + 0.9 * atime), and greedily packed
    into the token budget, highest score first.

    :param max_tokens: total token budget for the whole context
    :param max_file_tokens: per-file cap; files larger than this are skipped entirely
    :return: list of file names to include in the context
    """
    files_to_include = []
    file_scores = {}

    # To determine the pool of possible files, we start with files that have been recently committed
    possible_files = Coder.git_recent_commited_files()

    # then we add any staged and unstaged files
    possible_files += Coder.git_staged_files()
    possible_files += Coder.git_unstaged_files()

    for file in possible_files:
        path = Path(file)

        # Staged or recently committed files may have since been deleted from
        # the working tree; skip them instead of letting stat() raise
        # FileNotFoundError.
        if not path.exists():
            continue

        # Skip directories and empty files
        file_status = path.stat()
        if path.is_dir() or file_status.st_size == 0:
            continue

        # Skip binary files
        if Coder.is_binary_file(file):
            continue

        # Get the modification and access times
        modification_time = file_status.st_mtime
        access_time = file_status.st_atime

        # Calculate the score based on the modification and access times
        # For now, we'll just add the two times together, giving a slight preference to modification time
        score = modification_time + (access_time * 0.9)

        # Store the score in the dictionary, keyed by the file name
        file_scores[str(file)] = score

    # Sort the files by score in descending order (most recently touched first)
    sorted_files = sorted(file_scores, key=file_scores.get, reverse=True)

    # Add files to the list until we reach the max_tokens limit
    for file in sorted_files:
        token_length = Coder.get_token_length(Path(file).read_text())
        if token_length > max_file_tokens:
            # This single file would blow the per-file budget; skip it but
            # keep looking at smaller files.
            continue

        if token_length <= max_tokens:
            files_to_include.append(file)
            max_tokens -= token_length

        if max_tokens <= 0:
            break

    return files_to_include
75+
2376
@staticmethod
2477
def clone_repo(repo_url, repo_dir):
2578
"""Clones a git repository from the provided URL to the specified directory.
@@ -179,7 +232,23 @@ def get_llm_headers():
179232
return None
180233

181234
@staticmethod
182-
def get_llm_model_name(token_size=0):
235+
def get_model_token_limit(model_name):
236+
model_token_limits = {
237+
"openai/gpt-4": 8192,
238+
"openai/gpt-4-32k": 32768,
239+
"anthropic/claude-2": 100_000,
240+
"gpt-4": 8192,
241+
"gpt-4-32k": 32768,
242+
"gpt-3.5-turbo": 4096,
243+
"gpt-3.5-turbo-16k": 16384,
244+
}
245+
if model_name in model_token_limits:
246+
return model_token_limits[model_name]
247+
else:
248+
raise ValueError(f"Model {model_name} not found")
249+
250+
@staticmethod
251+
def get_llm_model_name(token_size=0, biggest_available=False):
183252
"""Gets the name of the model to use for the specified token size."""
184253
config = read_config()
185254
if os.getenv("AICODEBOT_LLM_MODEL"):
@@ -189,38 +258,45 @@ def get_llm_model_name(token_size=0):
189258
return os.getenv("AICODEBOT_LLM_MODEL")
190259

191260
if "openrouter_api_key" in config:
192-
model_options = {
193-
"openai/gpt-4": 8192,
194-
"openai/gpt-4-32k": 32768,
195-
"anthropic/claude-2": 100_000,
196-
}
197-
198-
supported_engines = model_options.keys()
261+
model_options = supported_engines = ["openai/gpt-4", "openai/gpt-4-32k"]
199262
else:
200-
model_options = {
201-
"gpt-4": 8192,
202-
"gpt-4-32k": 32768,
203-
"gpt-3.5-turbo": 4096,
204-
"gpt-3.5-turbo-16k": 16384,
205-
}
263+
model_options = ["gpt-4", "gpt-4-32k", "gpt-3.5-turbo", "gpt-3.5-turbo-16k"]
206264
# Pull the list of supported engines from the OpenAI API for this key
207265
supported_engines = Coder.get_openai_supported_engines()
208266

209-
# For some unknown reason, tiktoken often underestimates the token size by ~5%, so let's buffer
210-
token_size = int(token_size * 1.05)
267+
if biggest_available:
268+
# For some tasks we want to use the biggest model we can, only using gpt 3.5 if we have to
269+
biggest_choices = [
270+
"anthropic/claude-2",
271+
"gpt-4-32k",
272+
"openai/gpt-4-32k",
273+
"gpt-4",
274+
"openai/gpt-4",
275+
"gpt-3.5-turbo-16k",
276+
"gpt-3.5-turbo",
277+
]
278+
for model in biggest_choices:
279+
if model in supported_engines:
280+
logger.info(f"Using {model} for biggest available model")
281+
return model
282+
283+
else:
284+
# For some unknown reason, tiktoken often underestimates the token size by ~5%, so let's buffer
285+
token_size = int(token_size * 1.05)
211286

212-
for model, max_tokens in model_options.items():
213-
if model in supported_engines and token_size <= max_tokens:
214-
logger.info(f"Using {model} for token size {token_size}")
215-
return model
287+
for model_name in model_options:
288+
max_tokens = Coder.get_model_token_limit(model_name)
289+
if model_name in supported_engines and token_size <= max_tokens:
290+
logger.info(f"Using {model_name} for token size {token_size}")
291+
return model_name
216292

217-
logger.critical(
218-
f"The context is too large ({token_size}) for any of the models supported by your API key. 😞"
219-
)
220-
if "openrouter_api_key" not in config:
221293
logger.critical(
222-
"If you provide an Open Router API key, you can access larger models, up to 100k tokens"
294+
f"The context is too large ({token_size}) for any of the models supported by your API key. 😞"
223295
)
296+
if "openrouter_api_key" not in config:
297+
logger.critical(
298+
"If you provide an Open Router API key, you can access larger models, up to 100k tokens"
299+
)
224300

225301
return None
226302

@@ -290,6 +366,20 @@ def git_diff_context(commit=None, files=None):
290366

291367
return "\n".join(diffs)
292368

369+
@staticmethod
def git_recent_commited_files(max_files=10, max_commits=3):
    """Get a list of files changed in the most recent commits.

    :param max_commits: how many recent commits to inspect
    :param max_files: cap on the number of unique file names returned
    :return: up to max_files unique file names (set-derived, order unspecified)
    """
    # (The "commited" misspelling in the name is kept — callers reference it.)
    recent_commits = exec_and_get_output(["git", "log", "--format=%H", f"-{max_commits}"]).splitlines()
    if not recent_commits:
        return []

    # Get the list of files that have been changed in those commits
    # NOTE(review): `git diff <commit>` compares that commit against the
    # working tree, not the commit's own change set — confirm this is the
    # intended behavior.
    out = set()
    for commit in recent_commits:
        out.update(exec_and_get_output(["git", "diff", "--name-only", commit]).splitlines())

    return list(out)[:max_files]
382+
293383
@staticmethod
def git_staged_files():
    """Return the names of files currently staged in git, one per list entry."""
    staged = exec_and_get_output(["git", "diff", "--cached", "--name-only"])
    return staged.splitlines()

tests/test_coder.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,44 @@
11
from aicodebot.coder import Coder
22
from aicodebot.helpers import create_and_write_file
33
from pathlib import Path
4+
from tests.conftest import in_temp_directory
45
import os, pytest
56

67

8+
def test_auto_file_context(temp_git_repo):
    """auto_file_context should return only committed or staged files."""
    # Change the current directory to the temporary git repository
    with in_temp_directory(temp_git_repo.working_dir):
        # Create some test files in the repository
        create_and_write_file("file1.txt", "This is a test file.")
        create_and_write_file("file2.txt", "This is another test file.")
        create_and_write_file("file3.txt", "This is yet another test file.")

        # Untracked files are not candidates, so nothing is returned yet
        assert len(Coder.auto_file_context(1000, 500)) == 0

        # Commit the files
        temp_git_repo.git.add(".")
        temp_git_repo.git.commit("-m", "Add test files")

        assert len(Coder.auto_file_context(1000, 500)) == 3

        # Create an old file, it should not be included because it's not in git, not staged, etc.
        create_and_write_file("file5.txt", "This is an old test file.", overwrite=True)
        # Set the atime and the mtime to 10 days ago
        created = os.stat("file5.txt").st_mtime  # noqa: PTH116
        ten_days_ago = created - (10 * 24 * 60 * 60)
        os.utime("file5.txt", (ten_days_ago, ten_days_ago))

        # Create a new file, and stage it
        create_and_write_file("file4.txt", "This is a new test file.")
        # Still 3: file4 is untracked (not yet staged) and file5 is not in git
        assert len(Coder.auto_file_context(1000, 500)) == 3

        # Once staged, file4 becomes a candidate
        temp_git_repo.git.add("file4.txt")
        assert len(Coder.auto_file_context(1000, 500)) == 4
40+
41+
742
def test_generate_directory_structure(tmp_path):
843
# Create a file, a hidden file, another file, a .gitignore file, and a subdirectory in the temporary directory
944
create_and_write_file(tmp_path / "file.txt", "This is a test file")

0 commit comments

Comments
 (0)