Skip to content

Commit f7dbff5

Browse files
authored
History based reindex vs. git revert (#4609)
avoid processing untouched files when performing history based reindex fixes #4607
1 parent 0d92745 commit f7dbff5

File tree

4 files changed

+201
-4
lines changed

4 files changed

+201
-4
lines changed

opengrok-indexer/src/main/java/org/opengrok/indexer/configuration/RuntimeEnvironment.java

+8
Original file line numberDiff line numberDiff line change
@@ -1921,6 +1921,14 @@ public void maybeRefreshIndexSearchers() {
19211921
stat.report(LOGGER, "Done refreshing searcher managers");
19221922
}
19231923

1924+
@VisibleForTesting
1925+
public void releaseIndexSearchers() throws IOException {
1926+
for (SearcherManager sm : searcherManagerMap.values()) {
1927+
sm.close();
1928+
}
1929+
searcherManagerMap.clear();
1930+
}
1931+
19241932
/**
19251933
* Get IndexSearcher for given project or global IndexSearcher.
19261934
* Wrapper of {@link #getSuperIndexSearcher(String)}. Make sure to release the returned

opengrok-indexer/src/main/java/org/opengrok/indexer/index/IndexDatabase.java

+44-1
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@
3939
import java.util.ArrayList;
4040
import java.util.Arrays;
4141
import java.util.Comparator;
42+
import java.util.Date;
4243
import java.util.HashMap;
4344
import java.util.HashSet;
4445
import java.util.List;
@@ -959,6 +960,38 @@ boolean getIndexDownArgs(String dir, File sourceRoot, IndexDownArgs args) throws
959960
return historyBased;
960961
}
961962

963+
/**
964+
* @param file file under source root
965+
* @return false if the document date is newer or equal to the last modified time stamp of the file, otherwise true
966+
*/
967+
private static boolean isStrictlyNewerThanDocument(File file) {
968+
if (!file.exists()) {
969+
// Case of delete/renamed file.
970+
return true;
971+
}
972+
try {
973+
Document doc = IndexDatabase.getDocument(file);
974+
if (Objects.isNull(doc)) {
975+
return true;
976+
}
977+
IndexableField field = doc.getField(QueryBuilder.DATE);
978+
try {
979+
Date docDate = DateTools.stringToDate(field.stringValue());
980+
// Assumes millisecond precision.
981+
long lastModified = file.lastModified();
982+
if (lastModified <= docDate.getTime()) {
983+
return false;
984+
}
985+
} catch (java.text.ParseException e) {
986+
return true;
987+
}
988+
} catch (ParseException | IOException e) {
989+
LOGGER.log(Level.FINEST, "cannot get document for ''{0}''", file);
990+
}
991+
992+
return true;
993+
}
994+
962995
/**
963996
* Executes the first, serial stage of indexing, by going through set of files assembled from history.
964997
* @param sourceRoot path to the source root (same as {@link RuntimeEnvironment#getSourceRootPath()})
@@ -982,8 +1015,18 @@ void indexDownUsingHistory(File sourceRoot, IndexDownArgs args) throws IOExcepti
9821015
return;
9831016
}
9841017
File file = new File(sourceRoot, path.toString());
985-
processFileHistoryBased(args, file, path.toString());
9861018
progress.increment();
1019+
//
1020+
// If the changes to the file were nullified across a sequence of changesets, the repository
1021+
// might not have updated the file. The history collector is not that smart however,
1022+
// so handle such situation here.
1023+
//
1024+
if (!isStrictlyNewerThanDocument(file)) {
1025+
LOGGER.log(Level.FINEST, "file ''{0}'' is not newer than its document, skipping",
1026+
new Object[]{file});
1027+
continue;
1028+
}
1029+
processFileHistoryBased(args, file, path.toString());
9871030
}
9881031
}
9891032
}

opengrok-indexer/src/test/java/org/opengrok/indexer/index/IndexDatabaseTest.java

+146-3
Original file line numberDiff line numberDiff line change
@@ -43,13 +43,17 @@
4343
import java.util.stream.Stream;
4444

4545
import org.apache.commons.lang3.SystemUtils;
46+
import org.apache.lucene.document.DateTools;
4647
import org.apache.lucene.document.Document;
48+
import org.apache.lucene.index.IndexableField;
4749
import org.apache.lucene.queryparser.classic.ParseException;
4850
import org.apache.lucene.search.ScoreDoc;
4951

5052
import org.eclipse.jgit.api.Git;
5153
import org.eclipse.jgit.api.MergeCommand;
54+
import org.eclipse.jgit.api.errors.GitAPIException;
5255
import org.eclipse.jgit.lib.ObjectId;
56+
import org.eclipse.jgit.revwalk.RevCommit;
5357
import org.junit.jupiter.api.AfterEach;
5458
import org.junit.jupiter.api.BeforeEach;
5559
import org.junit.jupiter.api.Test;
@@ -171,6 +175,7 @@ void setUpClass() throws Exception {
171175

172176
@AfterEach
173177
void tearDownClass() throws Exception {
178+
env.releaseIndexSearchers();
174179
repository.destroy();
175180
}
176181

@@ -312,21 +317,28 @@ void testIndexPath() throws IOException {
312317

313318
@Test
314319
void testGetLastRev() throws IOException, ParseException {
320+
// IndexDatabase.getDocument() searches the index, so refresh the IndexSearcher objects
321+
// to get fresh results.
322+
env.maybeRefreshIndexSearchers();
315323
Document doc = IndexDatabase.getDocument(Paths.get(repository.getSourceRoot(),
316324
"git", "main.c").toFile());
317325
assertNotNull(doc);
318326
assertEquals("aa35c25882b9a60a97758e0ceb276a3f8cb4ae3a", doc.get(QueryBuilder.LASTREV));
319327
}
320328

321-
static void changeFileAndCommit(Git git, File file, String comment) throws Exception {
329+
static RevCommit changeFileAndCommit(Git git, File file, String comment) throws Exception {
322330
String authorName = "Foo Bar";
323331
String authorEmail = "[email protected]";
324332

325333
try (FileOutputStream fos = new FileOutputStream(file, true)) {
326334
fos.write(comment.getBytes(StandardCharsets.UTF_8));
327335
}
328336

329-
git.commit().setMessage(comment).setAuthor(authorName, authorEmail).setAll(true).call();
337+
return commitFile(git, comment, authorName, authorEmail);
338+
}
339+
340+
private static RevCommit commitFile(Git git, String comment, String authorName, String authorEmail) throws GitAPIException {
341+
return git.commit().setMessage(comment).setAuthor(authorName, authorEmail).setAll(true).call();
330342
}
331343

332344
private void addFileAndCommit(Git git, String newFileName, File repositoryRoot, String message) throws Exception {
@@ -338,7 +350,7 @@ private void addFileAndCommit(Git git, String newFileName, File repositoryRoot,
338350
fos.write("foo bar foo bar foo bar".getBytes(StandardCharsets.UTF_8));
339351
}
340352
git.add().addFilepattern(newFileName).call();
341-
git.commit().setMessage(message).setAuthor("foo bar", "[email protected]").setAll(true).call();
353+
commitFile(git, message, "foo bar", "[email protected]");
342354
}
343355

344356
private void addMergeCommit(Git git, File repositoryRoot) throws Exception {
@@ -1019,6 +1031,7 @@ void testAnnotationCacheProjectTunable(boolean useAnnotationCache, boolean isHis
10191031
// cleanup
10201032
gitProject.setHistoryBasedReindex(projectUseAnnotationOrig);
10211033
env.setDataRoot(dataRootOrig);
1034+
env.releaseIndexSearchers();
10221035
IOUtils.removeRecursive(dataRoot);
10231036
}
10241037

@@ -1076,4 +1089,134 @@ void testHistoryCacheForFileBasedRepository() throws Exception {
10761089
assertFalse(otherFile.exists());
10771090
assertFalse(historyGuru.hasHistoryCacheForFile(otherFile));
10781091
}
1092+
1093+
/**
1094+
* When incrementally indexing across Git changesets which modify the same file however the outcome
1095+
* is no change to the file (the changes nullify each other), IndexDatabase needs to filter these files
1096+
* out because Git does it as well. Otherwise, the indexer would attempt to add the document with
1097+
* time stamp of pre-existing document which would make indexing of the related project fail.
1098+
* This test simulates this case.
1099+
* <p>
1100+
* The strategy of this test is as follows:
1101+
* <ol>
1102+
* <li>initialize parent repository</li>
1103+
* <li>change+add file <code>foo.txt</code> in parent repository, commit</li>
1104+
* <li>change+add file <code>bar.txt</code> in parent repository, commit</li>
1105+
* <li>clone parent repository</li>
1106+
* <li>index the clone</li>
1107+
* <li>change <code>foo.txt</code> in parent repository, commit</li>
1108+
* <li>change <code>bar.txt</code> in parent repository, commit</li>
1109+
* <li>revert the change done to foo.txt in the last commit in parent repository</li>
1110+
* <li>pull the changes to the clone</li>
1111+
* <li>index the clone (incremental)</li>
1112+
* </ol>
1113+
* </p>
1114+
* Before the fix, the last reindex resulted in RuntimeException caused by the addition of the <code>foo.txt</code>
1115+
* file with the time stamp of the file before the last changes. This is because history based reindex
1116+
* extracts the list of files from the changesets, however Git does not update the file if the changes
1117+
* were nullified.
1118+
*/
1119+
@Test
1120+
void testNullifiedChanges() throws Exception {
1121+
File parentRepositoryRoot = new File(env.getSourceRootPath(), "gitNoChangeParent");
1122+
assertTrue(parentRepositoryRoot.mkdir());
1123+
1124+
env.setHistoryBasedReindex(true);
1125+
1126+
final String barName = "bar.txt";
1127+
final String repoName = "gitNoChange";
1128+
Path repositoryRootPath = Path.of(env.getSourceRootPath(), repoName);
1129+
List<String> projectList = List.of(File.separator + repoName);
1130+
try (Git gitParent = Git.init().setDirectory(parentRepositoryRoot).call()) {
1131+
// Create initial commits for the files in the parent repository.
1132+
final String fooName = "foo.txt";
1133+
File fooFile = new File(parentRepositoryRoot, fooName);
1134+
if (!fooFile.createNewFile()) {
1135+
throw new IOException("Could not create file " + fooFile);
1136+
}
1137+
gitParent.add().addFilepattern(fooName).call();
1138+
changeFileAndCommit(gitParent, fooFile, "first foo");
1139+
1140+
File barFile = new File(parentRepositoryRoot, barName);
1141+
if (!barFile.createNewFile()) {
1142+
throw new IOException("Could not create file " + barFile);
1143+
}
1144+
gitParent.add().addFilepattern(barName).call();
1145+
changeFileAndCommit(gitParent, barFile, "first bar");
1146+
1147+
// Clone the repository at this point so that subsequent changes can be pulled later on.
1148+
final String cloneUrl = parentRepositoryRoot.toURI().toString();
1149+
try (Git gitClone = Git.cloneRepository()
1150+
.setURI(cloneUrl)
1151+
.setDirectory(repositoryRootPath.toFile())
1152+
.call()) {
1153+
1154+
// Perform initial index. This is important so that history cache for the repository
1155+
// is created. It contains ID of the last indexed changeset which so that it can be
1156+
// used during the final reindex.
1157+
indexer.prepareIndexer(
1158+
env, true, true,
1159+
null, null);
1160+
env.setRepositories(new ArrayList<>(HistoryGuru.getInstance().getRepositories()));
1161+
env.generateProjectRepositoriesMap();
1162+
Project project = Project.getByName(repoName);
1163+
assertNotNull(project);
1164+
List<RepositoryInfo> repositoryInfos = env.getProjectRepositoriesMap().get(project);
1165+
assertEquals(1, repositoryInfos.size());
1166+
assertEquals("git", repositoryInfos.get(0).getType());
1167+
indexer.doIndexerExecution(projectList, null);
1168+
1169+
// Change the parent repository so that it contains nullified change to the foo.txt file.
1170+
final String data = "change foo";
1171+
gitParent.add().addFilepattern(fooName).call();
1172+
RevCommit commit = changeFileAndCommit(gitParent, fooFile, data);
1173+
1174+
// Also throw another file into the mix so that it resembles reality a bit more.
1175+
changeFileAndCommit(gitParent, barFile, "change bar");
1176+
1177+
// Revert the changes done to foo.txt so that the changes got nullified for the subsequent pull.
1178+
gitParent.revert().include(commit).call();
1179+
1180+
// Bring the changes to the repository to be indexed. Again, done for better simulation.
1181+
gitClone.pull().call();
1182+
}
1183+
}
1184+
1185+
// Final reindex. This should discover the changes done to the clone and index them.
1186+
indexer.prepareIndexer(
1187+
env, true, true,
1188+
null, null);
1189+
//
1190+
// Use IndexDatabase instead of indexer.doIndexerExecution(projectList, null) because
1191+
// it will detect the indexing failure via RuntimeException. Also, it will be possible
1192+
// to determine via mocking whether history based reindex was used.
1193+
//
1194+
IndexDownArgsFactory factory = new IndexDownArgsFactory();
1195+
IndexDownArgsFactory spyFactory = spy(factory);
1196+
IndexDownArgs args = new IndexDownArgs();
1197+
// In this case the getIndexDownArgs() should be called from update() just once so this will suffice.
1198+
when(spyFactory.getIndexDownArgs()).thenReturn(args);
1199+
Project project = env.getProjects().get(repoName);
1200+
assertNotNull(project);
1201+
IndexDatabase idbOrig = new IndexDatabase(project, spyFactory);
1202+
assertNotNull(idbOrig);
1203+
IndexDatabase idb = spy(idbOrig);
1204+
idb.update();
1205+
// Verify history based reindex was used.
1206+
checkIndexDown(true, idb);
1207+
1208+
// Check that the document for bar.txt was updated. Serves as a smoke test.
1209+
File barFile = new File(repositoryRootPath.toString(), barName);
1210+
assertTrue(barFile.exists());
1211+
// IndexDatabase.getDocument() performs index search to retrieve the document, so the corresponding
1212+
// IndexSearcher object has to be bumped in order to get fresh document.
1213+
env.maybeRefreshIndexSearchers();
1214+
Document barDoc = IndexDatabase.getDocument(barFile);
1215+
assertNotNull(barDoc);
1216+
IndexableField field = barDoc.getField(QueryBuilder.DATE);
1217+
String docDate = field.stringValue();
1218+
// Need to use the same resolution as in AnalyzerGuru#populateDocument().
1219+
String fileDate = DateTools.timeToString(barFile.lastModified(), DateTools.Resolution.MILLISECOND);
1220+
assertEquals(fileDate, docDate);
1221+
}
10791222
}

opengrok-indexer/src/test/java/org/opengrok/indexer/index/IndexerVsDeletedDocumentsTest.java

+3
Original file line numberDiff line numberDiff line change
@@ -199,6 +199,9 @@ void setup() throws IOException {
199199

200200
@AfterEach
201201
void cleanup() throws IOException {
202+
// Release any references to index files so that it is actually possible to
203+
// remove the index files (under data root) on Windows.
204+
env.releaseIndexSearchers();
202205
IOUtils.removeRecursive(Path.of(env.getDataRootPath()));
203206
// FileUtils.deleteDirectory() avoids AccessDeniedException on Windows.
204207
FileUtils.deleteDirectory(env.getSourceRootFile());

0 commit comments

Comments
 (0)