Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
<junit.version>5.13.2</junit.version>
<slf4j-simple.version>2.0.17</slf4j-simple.version>
<apache-commons-collections4.version>4.5.0</apache-commons-collections4.version>
<externalsortinginjava.version>0.6.2</externalsortinginjava.version>
<maven-compiler-plugin.version>3.14.0</maven-compiler-plugin.version>
<maven-dependency-plugin.version>3.8.1</maven-dependency-plugin.version>
<maven-source-plugin.version>3.3.1</maven-source-plugin.version>
Expand Down Expand Up @@ -231,6 +232,13 @@
<version>${tableschema-java-version}</version>
</dependency>

<!-- Sorting -->
<dependency>
<groupId>com.google.code.externalsortinginjava</groupId>
<artifactId>externalsortinginjava</artifactId>
<version>${externalsortinginjava.version}</version>
</dependency>

<!-- Unit Testing -->
<dependency>
<groupId>org.junit.jupiter</groupId>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.code.externalsorting.ExternalSort;
import io.frictionlessdata.datapackage.Dialect;
import io.frictionlessdata.datapackage.JSONBase;
import io.frictionlessdata.datapackage.Package;
Expand All @@ -14,10 +15,7 @@
import io.frictionlessdata.datapackage.exceptions.DataPackageValidationException;
import io.frictionlessdata.datapackage.fk.PackageForeignKey;
import io.frictionlessdata.tableschema.Table;
import io.frictionlessdata.tableschema.exception.ForeignKeyException;
import io.frictionlessdata.tableschema.exception.JsonSerializingException;
import io.frictionlessdata.tableschema.exception.TableIOException;
import io.frictionlessdata.tableschema.exception.TypeInferringException;
import io.frictionlessdata.tableschema.exception.*;
import io.frictionlessdata.tableschema.field.Field;
import io.frictionlessdata.tableschema.fk.ForeignKey;
import io.frictionlessdata.tableschema.io.FileReference;
Expand All @@ -40,6 +38,7 @@
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.*;
import java.util.stream.Collectors;

/**
* Abstract base implementation of a Resource.
Expand Down Expand Up @@ -368,6 +367,7 @@ public List<Table> getTables() throws Exception {
return tables;
}

@Override
public void checkRelations(Package pkg) {
if (null != schema) {
List<PackageForeignKey> fks = new ArrayList<>();
Expand Down Expand Up @@ -439,6 +439,71 @@ public void checkRelations(Package pkg) {
}
}

/**
 * Verifies that the primary key declared in this resource's schema is unique across all rows.
 * <p>
 * Does nothing when the resource has no schema or the schema declares no primary key.
 * Otherwise one line per row, holding the tab-joined primary key values, is written to a
 * temporary file; the file is sorted with {@link ExternalSort} and then scanned for two
 * identical adjacent lines.
 * <p>
 * Tab, carriage return, line feed and backslash characters inside key values are
 * backslash-escaped so that every row maps to exactly one line and distinct composite keys
 * cannot collapse into the same line. A {@code null} value is rendered as the text
 * {@code "null"} (via {@link String#valueOf(Object)}) and is therefore indistinguishable
 * from that literal string.
 * <p>
 * The temporary files are removed in all cases, including when a violation is reported.
 * <p>
 * NOTE(review): the rows are still obtained as one in-memory list from {@code getData},
 * so the external sort does not bound memory usage on its own — confirm whether a
 * streaming accessor is available.
 * <p>
 * NOTE(review): any exception raised during the check, including the duplicate-key
 * exception itself, is re-wrapped using only its message; the original cause is not
 * attached because the available {@code PrimaryKeyException} constructors are not visible here.
 *
 * @throws PrimaryKeyException if the primary key has an unsupported type, if a duplicate
 *                             key is found, or if reading, writing or sorting fails
 */
@Override
public void checkPrimaryKeys() {
    if (null == schema) {
        return;
    }
    Object pkObj = schema.getPrimaryKey();
    if (pkObj == null) {
        return; // no primary key defined
    }

    // Normalize the primary key to an array of field names.
    final String[] pkFields;
    if (pkObj instanceof String) {
        pkFields = new String[]{(String) pkObj};
    } else if (pkObj instanceof String[]) {
        pkFields = (String[]) pkObj;
    } else {
        throw new PrimaryKeyException("Unsupported primary key type: " + pkObj.getClass());
    }

    Path keysFile = null;
    Path sortedFile = null;
    try {
        // Dump all keys to a temporary file, one escaped key per line.
        keysFile = Files.createTempFile("pk-check", ".txt");
        try (BufferedWriter writer = Files.newBufferedWriter(keysFile, StandardCharsets.UTF_8)) {
            // presumably each element is a Map keyed by field name; the cast below relies on it
            List<Object> data = this.getData(true, false, true, false);
            for (Object d : data) {
                @SuppressWarnings("unchecked")
                Map<String, Object> row = (Map<String, Object>) d;
                String key = Arrays.stream(pkFields)
                        .map(f -> escapePrimaryKeyValue(String.valueOf(row.get(f))))
                        .collect(Collectors.joining("\t"));
                writer.write(key);
                writer.newLine();
            }
        }

        // Sort the key file. The charset is passed explicitly: the short overloads fall back
        // to the platform default charset, which would corrupt non-ASCII keys written as UTF-8.
        sortedFile = Files.createTempFile("pk-check-sorted", ".txt");
        Comparator<String> keyOrder = Comparator.naturalOrder();
        int maxTempFiles = 1024; // same limit the library uses by default
        List<File> tempChunks = ExternalSort.sortInBatch(
                keysFile.toFile(), keyOrder, maxTempFiles, StandardCharsets.UTF_8, (File) null, false);
        ExternalSort.mergeSortedFiles(tempChunks, sortedFile.toFile(), keyOrder, StandardCharsets.UTF_8);

        // After sorting, equal keys are adjacent: compare each line with its predecessor.
        try (BufferedReader reader =
                     new BufferedReader(new FileReader(sortedFile.toFile(), StandardCharsets.UTF_8))) {
            String prev = null;
            String line;
            while ((line = reader.readLine()) != null) {
                if (line.equals(prev)) {
                    throw new PrimaryKeyException(
                            "Primary key violation in resource '" + this.getName() +
                                    "': duplicate key " + line
                    );
                }
                prev = line;
            }
        }
    } catch (Exception e) {
        throw new PrimaryKeyException("Error validating primary keys: " + e.getMessage());
    } finally {
        // Always clean up, also when a violation or an I/O error aborted the check.
        deletePrimaryKeyTempFile(keysFile);
        deletePrimaryKeyTempFile(sortedFile);
    }
}

/**
 * Escapes backslash, tab, line feed and carriage return so a key value always occupies
 * exactly one field on exactly one line of the temporary key file.
 */
private static String escapePrimaryKeyValue(String value) {
    // Backslash must be escaped first so the escapes added below stay unambiguous.
    return value
            .replace("\\", "\\\\")
            .replace("\t", "\\t")
            .replace("\n", "\\n")
            .replace("\r", "\\r");
}

/**
 * Best-effort removal of a temporary file created by {@link #checkPrimaryKeys()}.
 */
private static void deletePrimaryKeyTempFile(Path file) {
    if (file == null) {
        return;
    }
    try {
        Files.deleteIfExists(file);
    } catch (IOException ignored) {
        // A leftover temp file must not mask the outcome of the primary key check.
    }
}

public void validate(Package pkg) {

try {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -364,6 +364,8 @@ static ResourceBuilder builder(String resourceName) {

/**
 * Checks the relations of this resource in the context of the given package.
 * <p>
 * NOTE(review): presumably this validates foreign keys that point to other resources
 * of {@code pkg} — confirm against the implementing classes.
 *
 * @param pkg the package this resource is checked against
 * @throws Exception if the check fails
 */
void checkRelations(Package pkg) throws Exception;

/**
 * Checks the primary key of this resource.
 * <p>
 * NOTE(review): implementations are expected to report a violation (for example
 * duplicate key values) by throwing an exception, presumably a
 * {@code PrimaryKeyException} — confirm against the implementing classes.
 *
 * @throws Exception if the check fails
 */
void checkPrimaryKeys() throws Exception;

/**
* Recreate a Resource object from a JSON descriptor, a base path to resolve relative file paths against
* and a flag that tells us whether we are reading from inside a ZIP archive.
Expand Down
71 changes: 71 additions & 0 deletions src/test/java/io/frictionlessdata/datapackage/PrimaryKeysTest.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.frictionlessdata.datapackage;

import io.frictionlessdata.datapackage.resource.Resource;
import io.frictionlessdata.tableschema.exception.PrimaryKeyException;
import org.junit.jupiter.api.DisplayName;
import org.junit.jupiter.api.Test;

import java.nio.file.Path;

import static org.junit.jupiter.api.Assertions.*;
import static org.junit.jupiter.api.Assertions.assertEquals;

/**
 * Tests for the uniqueness check of simple and composite primary keys on a resource.
 */
public class PrimaryKeysTest {

    /**
     * Loads the data package descriptor at the given fixture location and returns its
     * "teams" resource.
     */
    private static Resource loadTeams(String descriptorLocation) throws Exception {
        Path descriptorPath = TestUtil.getResourcePath(descriptorLocation);
        Package dataPackage = new Package(descriptorPath, true);
        return dataPackage.getResource("teams");
    }

    @Test
    @DisplayName("Test the uniqueness of simple primary keys - invalid case")
    void testPrimaryKeysUniqueInvalid() throws Exception {
        Resource teams = loadTeams("/fixtures/datapackages/primary-keys/simple/primary_keys_csv_invalid.json");

        // A duplicate id must surface as a PrimaryKeyException carrying the offending key.
        PrimaryKeyException violation = assertThrows(PrimaryKeyException.class, teams::checkPrimaryKeys);
        assertEquals(
                "Error validating primary keys: Primary key violation in resource 'teams': duplicate key 1",
                violation.getMessage());
    }

    @Test
    @DisplayName("Test the uniqueness of simple primary keys - valid case")
    void testPrimaryKeysUniqueValid() throws Exception {
        Resource teams = loadTeams("/fixtures/datapackages/primary-keys/simple/primary_keys_csv_valid.json");

        assertDoesNotThrow(teams::checkPrimaryKeys);
    }

    @Test
    @DisplayName("Test the uniqueness of composite primary keys - invalid case")
    void testCompositePrimaryKeysUniqueInvalid() throws Exception {
        Resource teams = loadTeams("/fixtures/datapackages/primary-keys/composite/primary_keys_csv_invalid.json");

        // The composite key is reported with its parts joined by a tab.
        PrimaryKeyException violation = assertThrows(PrimaryKeyException.class, teams::checkPrimaryKeys);
        assertEquals(
                "Error validating primary keys: Primary key violation in resource 'teams': duplicate key UK\tLondon",
                violation.getMessage());
    }

    @Test
    @DisplayName("Test the uniqueness of composite primary keys - valid case")
    void testCompositePrimaryKeysUniqueValid() throws Exception {
        Resource teams = loadTeams("/fixtures/datapackages/primary-keys/composite/primary_keys_csv_valid.json");

        assertDoesNotThrow(teams::checkPrimaryKeys);
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -736,6 +736,10 @@ public String getSerializationFormat() {
public void checkRelations(Package aPackage) throws Exception {
    // Intentionally empty: this implementation performs no relation checks.
}

@Override
public void checkPrimaryKeys() throws Exception {
    // Intentionally empty: this implementation performs no primary key checks.
}

@Override
public void validate(Package aPackage) {
    // Intentionally empty: this implementation performs no validation.
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
{
"name": "foreign-keys",
"resources": [
{
"name": "teams",
"profile": "tabular-data-resource",
"encoding": "UTF-8",
"format": "csv",
"schema": {
"fields": [
{
"name": "name",
"type": "string"
},
{
"name": "country",
"type": "string"
},
{
"name": "city",
"type": "string"
}
],
"primaryKey": ["country", "city"]
},
"path": "teams.csv"
}
]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
{
"name": "foreign-keys",
"resources": [
{
"name": "teams",
"profile": "tabular-data-resource",
"encoding": "UTF-8",
"format": "csv",
"schema": {
"fields": [
{
"name": "name",
"type": "string"
},
{
"name": "country",
"type": "string"
},
{
"name": "city",
"type": "string"
}
],
"primaryKey": ["country", "city"]
},
"path": "teams-valid.csv"
}
]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
name,country,city
Arsenal,UK,London
Real,Spain,Madrid
Bayern,Germany,Munich
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
name,country,city
Arsenal,UK,London
Real,Spain,Madrid
Bayern,Germany,Munich
Chelsea,UK,London
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
{
"name": "foreign-keys",
"resources": [
{
"name": "teams",
"profile": "tabular-data-resource",
"encoding": "UTF-8",
"format": "csv",
"schema": {
"fields": [
{
"name": "id",
"type": "integer",
"constraints": {
"required": true,
"unique": true
}
},
{
"name": "name",
"type": "string"
},
{
"name": "city",
"type": "string"
}
],
"primaryKey": "id"
},
"path": "teams.csv"
}
]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
{
"name": "foreign-keys",
"resources": [
{
"name": "teams",
"profile": "tabular-data-resource",
"encoding": "UTF-8",
"format": "csv",
"schema": {
"fields": [
{
"name": "id",
"type": "integer",
"constraints": {
"required": true,
"unique": true
}
},
{
"name": "name",
"type": "string"
},
{
"name": "city",
"type": "string"
}
],
"primaryKey": "id"
},
"path": "teams-valid.csv"
}
]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
id,name,city
1,Arsenal,London
2,Real,Madrid
3,Bayern,Munich
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
id,name,city
1,Arsenal,London
1,Real,Madrid
1,Bayern,Munich