@@ -20,6 +20,59 @@ class Coder:
20
20
21
21
UNKNOWN_FILE_TYPE = "unknown"
22
22
23
@staticmethod
def auto_file_context(max_tokens, max_file_tokens):
    """Automatically generate a file context based on what we think the user is working on.

    Candidate files are gathered from recently committed, staged, and unstaged
    changes, scored by how recently they were touched, then greedily packed
    into the context until the token budget is exhausted.

    Args:
        max_tokens: Total token budget available for the whole file context.
        max_file_tokens: Skip any single file whose contents exceed this many tokens.

    Returns:
        A list of file path strings to include in the context.
    """
    # To determine the pool of possible files, we start with files that have
    # been recently committed, then add any staged and unstaged files.
    possible_files = Coder.git_recent_commited_files()
    possible_files += Coder.git_staged_files()
    possible_files += Coder.git_unstaged_files()

    file_scores = {}
    for file in possible_files:
        path = Path(file)
        # Files listed in recent commits may since have been deleted or
        # renamed in the working tree; stat() on them would raise.
        if not path.exists():
            continue

        # Skip directories and empty files
        file_status = path.stat()
        if path.is_dir() or file_status.st_size == 0:
            continue

        # Skip binary files
        if Coder.is_binary_file(file):
            continue

        # Calculate the score based on the modification and access times.
        # For now, we'll just add the two times together, giving a slight
        # preference to modification time.
        score = file_status.st_mtime + (file_status.st_atime * 0.9)

        # Keyed by the path string, so duplicate candidates (e.g. a file
        # that is both staged and recently committed) collapse to one entry.
        file_scores[str(file)] = score

    # Sort the files by score in descending order
    sorted_files = sorted(file_scores, key=file_scores.get, reverse=True)

    # Add files to the list until we reach the max_tokens limit
    files_to_include = []
    for file in sorted_files:
        token_length = Coder.get_token_length(Path(file).read_text())
        if token_length > max_file_tokens:
            continue

        if token_length <= max_tokens:
            files_to_include.append(file)
            max_tokens -= token_length

        if max_tokens <= 0:
            break

    return files_to_include
23
76
@staticmethod
24
77
def clone_repo (repo_url , repo_dir ):
25
78
"""Clones a git repository from the provided URL to the specified directory.
@@ -179,7 +232,23 @@ def get_llm_headers():
179
232
return None
180
233
181
234
@staticmethod
182
- def get_llm_model_name (token_size = 0 ):
235
+ def get_model_token_limit (model_name ):
236
+ model_token_limits = {
237
+ "openai/gpt-4" : 8192 ,
238
+ "openai/gpt-4-32k" : 32768 ,
239
+ "anthropic/claude-2" : 100_000 ,
240
+ "gpt-4" : 8192 ,
241
+ "gpt-4-32k" : 32768 ,
242
+ "gpt-3.5-turbo" : 4096 ,
243
+ "gpt-3.5-turbo-16k" : 16384 ,
244
+ }
245
+ if model_name in model_token_limits :
246
+ return model_token_limits [model_name ]
247
+ else :
248
+ raise ValueError (f"Model { model_name } not found" )
249
+
250
+ @staticmethod
251
+ def get_llm_model_name (token_size = 0 , biggest_available = False ):
183
252
"""Gets the name of the model to use for the specified token size."""
184
253
config = read_config ()
185
254
if os .getenv ("AICODEBOT_LLM_MODEL" ):
@@ -189,38 +258,45 @@ def get_llm_model_name(token_size=0):
189
258
return os .getenv ("AICODEBOT_LLM_MODEL" )
190
259
191
260
if "openrouter_api_key" in config :
192
- model_options = {
193
- "openai/gpt-4" : 8192 ,
194
- "openai/gpt-4-32k" : 32768 ,
195
- "anthropic/claude-2" : 100_000 ,
196
- }
197
-
198
- supported_engines = model_options .keys ()
261
+ model_options = supported_engines = ["openai/gpt-4" , "openai/gpt-4-32k" ]
199
262
else :
200
- model_options = {
201
- "gpt-4" : 8192 ,
202
- "gpt-4-32k" : 32768 ,
203
- "gpt-3.5-turbo" : 4096 ,
204
- "gpt-3.5-turbo-16k" : 16384 ,
205
- }
263
+ model_options = ["gpt-4" , "gpt-4-32k" , "gpt-3.5-turbo" , "gpt-3.5-turbo-16k" ]
206
264
# Pull the list of supported engines from the OpenAI API for this key
207
265
supported_engines = Coder .get_openai_supported_engines ()
208
266
209
- # For some unknown reason, tiktoken often underestimates the token size by ~5%, so let's buffer
210
- token_size = int (token_size * 1.05 )
267
+ if biggest_available :
268
+ # For some tasks we want to use the biggest model we can, only using gpt 3.5 if we have to
269
+ biggest_choices = [
270
+ "anthropic/claude-2" ,
271
+ "gpt-4-32k" ,
272
+ "openai/gpt-4-32k" ,
273
+ "gpt-4" ,
274
+ "openai/gpt-4" ,
275
+ "gpt-3.5-turbo-16k" ,
276
+ "gpt-3.5-turbo" ,
277
+ ]
278
+ for model in biggest_choices :
279
+ if model in supported_engines :
280
+ logger .info (f"Using { model } for biggest available model" )
281
+ return model
282
+
283
+ else :
284
+ # For some unknown reason, tiktoken often underestimates the token size by ~5%, so let's buffer
285
+ token_size = int (token_size * 1.05 )
211
286
212
- for model , max_tokens in model_options .items ():
213
- if model in supported_engines and token_size <= max_tokens :
214
- logger .info (f"Using { model } for token size { token_size } " )
215
- return model
287
+ for model_name in model_options :
288
+ max_tokens = Coder .get_model_token_limit (model_name )
289
+ if model_name in supported_engines and token_size <= max_tokens :
290
+ logger .info (f"Using { model_name } for token size { token_size } " )
291
+ return model_name
216
292
217
- logger .critical (
218
- f"The context is too large ({ token_size } ) for any of the models supported by your API key. 😞"
219
- )
220
- if "openrouter_api_key" not in config :
221
293
logger .critical (
222
- "If you provide an Open Router API key, you can access larger models, up to 100k tokens "
294
+ f"The context is too large ( { token_size } ) for any of the models supported by your API key. 😞 "
223
295
)
296
+ if "openrouter_api_key" not in config :
297
+ logger .critical (
298
+ "If you provide an Open Router API key, you can access larger models, up to 100k tokens"
299
+ )
224
300
225
301
return None
226
302
@@ -290,6 +366,20 @@ def git_diff_context(commit=None, files=None):
290
366
291
367
return "\n " .join (diffs )
292
368
369
@staticmethod
def git_recent_commited_files(max_files=10, max_commits=3):
    """Get files changed in the last max_commits commits, capped at max_files entries.

    Returns an empty list when the repository has no commits yet.
    """
    recent_commits = exec_and_get_output(["git", "log", "--format=%H", f"-{max_commits}"]).splitlines()
    if not recent_commits:
        return []

    # Get the set of files that have been changed in those commits.
    out = set()
    for commit in recent_commits:
        out.update(exec_and_get_output(["git", "diff", "--name-only", commit]).splitlines())

    # Sort before truncating so the cap keeps a deterministic subset
    # (iteration order of a set is arbitrary between runs).
    return sorted(out)[:max_files]
382
+
293
383
@staticmethod
def git_staged_files():
    """Return the paths of the files currently staged in git, one per list entry."""
    staged_output = exec_and_get_output(["git", "diff", "--cached", "--name-only"])
    return staged_output.splitlines()
0 commit comments