CSPro unique rec type, SurveySolutions unique var name

jhandley · jhandley · commit bc5cd918154f · 2025-07-14T22:59:20.000-04:00
diff --git a/pyproject.toml b/pyproject.toml
@@ -159,6 +159,11 @@ enableTypeIgnoreComments = true
 reportUnusedCallResult = false
 reportAny = false
 
+# Disable private usage reporting for test files
+[[tool.basedpyright.executionEnvironments]]
+root = "tests"
+reportPrivateUsage = false
+
 [tool.codespell]
 # Add here as needed:
 ignore-words-list = "popstan"
diff --git a/src/survaize/surveysolutions/surveysolutions_writer.py b/src/survaize/surveysolutions/surveysolutions_writer.py
diff --git a/src/survaize/writer/cspro_writer.py b/src/survaize/writer/cspro_writer.py
@@ -131,7 +131,10 @@ def _generate_data_dictionary(self, questionnaire: Questionnaire) -> tuple[CSPro
                     id_items.append(id_item)
 
         # Create records for each section (excluding empty sections)
+        # Record types are assigned sequentially (A, B, C, ..., Z, AA, AB, AC, ...)
+        # to ensure uniqueness regardless of section numbering scheme
         records: list[DictionaryRecord] = []
+        record_type_counter = 0  # Counter for generating unique record types
         for section in questionnaire.sections:
             # Create a record for this section with non-ID questions
             record_items: list[DictionaryItem] = []
@@ -153,10 +156,14 @@ def _generate_data_dictionary(self, questionnaire: Questionnaire) -> tuple[CSPro
 
             # Only create record if it has items
             if record_items:
+                # Generate unique record type: A, B, C, ..., Z, then AA, AB, etc.
+                record_type = self._generate_record_type(record_type_counter)
+                record_type_counter += 1
+
                 record = DictionaryRecord(
                     name=self._to_dictionary_name(f"{section.id}_REC"),
                     labels=[DictionaryLabel(text=self._to_dictionary_label(section.number, section.id))],
-                    recordType=section.number[0],  # Use first letter of section number as record type
+                    recordType=record_type,
                     items=record_items,
                     occurrences=DictionaryRecordOccurrences(required=False, maximum=section.occurrences),
                 )
@@ -808,3 +815,39 @@ def _replace_suffix(self, string: str, old_suffix: str, new_suffix: str) -> str:
         if string.endswith(old_suffix):
             return string[: -len(old_suffix)] + new_suffix
         return string
+
+    def _generate_record_type(self, index: int) -> str:
+        """Generate a unique, letters-only record type for a zero-based index.
+
+        Codes follow spreadsheet-column order (bijective base-26):
+        A, B, C, ..., Z (indices 0-25)
+        AA, AB, ..., AZ, BA, ..., ZZ (indices 26-701)
+        AAA, AAB, ... (indices 702 and above)
+
+        Every non-negative index maps to a distinct code made only of the
+        letters A-Z, and the code is as short as the index allows.
+
+        Args:
+            index: Zero-based position of the record among generated records.
+
+        Returns:
+            Upper-case record type string (1 letter for 0-25, 2 for 26-701, ...).
+
+        Raises:
+            ValueError: If index is negative.
+        """
+        if index < 0:
+            raise ValueError(f"Record type index must be non-negative, got {index}")
+
+        # NOTE(review): confirm the dictionary's record type length fits 3+ letters.
+
+        # Bijective base-26 has no zero digit, so subtract one before peeling off
+        # each letter; letters come out least-significant first.
+        letters: list[str] = []
+        remaining = index + 1
+        while remaining > 0:
+            remaining, digit = divmod(remaining - 1, 26)
+            letters.append(chr(ord("A") + digit))
+
+        # Reverse into reading order (most-significant letter first).
+        return "".join(reversed(letters))
diff --git a/src/survaize/writer/surveysolutions_writer.py b/src/survaize/writer/surveysolutions_writer.py
@@ -43,6 +43,10 @@
 class SurveySolutionsWriter(Writer):
     """Writer that converts Survaize questionnaires to Survey Solutions JSON format."""
 
+    def __init__(self) -> None:
+        """Initialize the writer with an empty set of already-issued variable names."""
+        self._used_variable_names: set[str] = set()  # cleared at the start of each write()
+
     @override
     def write(self, questionnaire: Questionnaire, output_path: Path) -> None:
         """Write a questionnaire to Survey Solutions backup format (zip file).
@@ -53,6 +57,9 @@ def write(self, questionnaire: Questionnaire, output_path: Path) -> None:
         """
         logger.info(f"Converting questionnaire '{questionnaire.title}' to Survey Solutions format")
 
+        # Reset used variable names for this conversion
+        self._used_variable_names.clear()
+
         # Convert Survaize questionnaire to Survey Solutions format
         ss_questionnaire = self._convert_questionnaire(questionnaire)
 
@@ -189,18 +196,19 @@ def _convert_question(self, question: Question) -> QuestionElement:
             )
 
     def generate_variable_name(self, name: str) -> str:
-        """Generate a valid Survey Solutions variable name from a string.
+        """Generate a valid and unique Survey Solutions variable name from a string.
 
         Survey Solutions variable names must:
         - Start with a letter
         - Contain only letters, numbers, and underscores
         - Be no longer than 32 characters
+        - Be unique within the questionnaire
 
         Args:
             name: The source name
 
         Returns:
-            A valid variable name
+            A valid and unique variable name
         """
         # Remove special characters and replace spaces with underscores
         variable_name = re.sub(r"[^a-zA-Z0-9_]", "_", name)
@@ -213,11 +221,27 @@ def generate_variable_name(self, name: str) -> str:
         if not variable_name:
             variable_name = "Variable"
 
-        # Truncate to 32 characters
+        # Truncate to 32 characters initially
         if len(variable_name) > 32:
             variable_name = variable_name[:32]
 
         # Remove trailing underscores
         variable_name = variable_name.rstrip("_")
 
+        # Ensure uniqueness by adding a suffix if needed
+        original_name = variable_name
+        counter = 1
+        while variable_name in self._used_variable_names:
+            # Calculate how much space we need for the suffix
+            suffix = f"_{counter}"
+            max_base_length = 32 - len(suffix)
+
+            # Truncate the base name if needed to make room for suffix
+            base_name = original_name[:max_base_length].rstrip("_")
+            variable_name = base_name + suffix
+            counter += 1
+
+        # Add to used names set
+        self._used_variable_names.add(variable_name)
+
         return variable_name
diff --git a/tests/test_cspro_writer.py b/tests/test_cspro_writer.py
@@ -45,3 +45,55 @@ def test_cspro_writer_generates_expected_files(tmp_path: Path) -> None:
         f"Unexpected generated files: {sorted(generated_files - fixture_files)}; "
         f"missing files: {sorted(fixture_files - generated_files)}"
     )
+
+
+def test_generate_record_type() -> None:
+    """Test that _generate_record_type maps indices 0-701 to the unique codes A-Z, then AA-ZZ."""
+    writer = CSProWriter()
+
+    # Indices 0-25 map to the single letters A-Z
+    for i in range(26):
+        expected = chr(ord("A") + i)
+        actual = writer._generate_record_type(i)
+        assert actual == expected, f"Index {i}: expected {expected}, got {actual}"
+
+    # Indices 26-51 map to AA-AZ
+    for i in range(26, 52):
+        second_char = chr(ord("A") + (i - 26))
+        expected = f"A{second_char}"
+        actual = writer._generate_record_type(i)
+        assert actual == expected, f"Index {i}: expected {expected}, got {actual}"
+
+    # Indices 52-77 map to BA-BZ
+    for i in range(52, 78):
+        second_char = chr(ord("A") + (i - 52))
+        expected = f"B{second_char}"
+        actual = writer._generate_record_type(i)
+        assert actual == expected, f"Index {i}: expected {expected}, got {actual}"
+
+    # Boundary cases where the first letter changes or the code gets longer
+    test_cases = [
+        (0, "A"),  # First record
+        (25, "Z"),  # Last single character
+        (26, "AA"),  # First double character
+        (51, "AZ"),  # Last A* record
+        (52, "BA"),  # First B* record
+        (77, "BZ"),  # Last B* record
+        (78, "CA"),  # First C* record
+        (100, "CW"),  # Arbitrary middle case: 100-26=74, 74//26=2 (C), 74%26=22 (W)
+        (701, "ZZ"),  # Last double character: 701-26=675, 675//26=25 (Z), 675%26=25 (Z)
+    ]
+
+    for index, expected in test_cases:
+        actual = writer._generate_record_type(index)
+        assert actual == expected, f"Index {index}: expected {expected}, got {actual}"
+
+    # No code may repeat anywhere in the one- and two-letter range
+    record_types: set[str] = set()
+    for i in range(702):  # Indices 0-701 end at ZZ (last 2-character combination)
+        record_type = writer._generate_record_type(i)
+        assert record_type not in record_types, f"Duplicate record type {record_type} at index {i}"
+        record_types.add(record_type)
+
+    # 26 single-letter codes + 26*26 two-letter codes = 702 distinct record types
+    assert len(record_types) == 702
diff --git a/tests/test_surveysolutions_writer.py b/tests/test_surveysolutions_writer.py
@@ -144,3 +144,140 @@ def test_variable_name_generation() -> None:
     # Test long name (should be truncated to 32 chars)
     long_name = "a" * 40
     assert len(writer.generate_variable_name(long_name)) == 32
+
+
+def test_variable_name_uniqueness() -> None:
+    """Test that variable names are unique even when they would be the same after truncation."""
+    writer = SurveySolutionsWriter()
+
+    # Two distinct 38-character names that differ only after position 32
+    long_name_1 = "a" * 35 + "xyz"  # Truncation keeps the first 32 chars (all "a")
+    long_name_2 = "a" * 35 + "abc"  # Same first 32 chars, so it collides once truncated
+
+    var1 = writer.generate_variable_name(long_name_1)
+    var2 = writer.generate_variable_name(long_name_2)
+
+    # Both must respect the 32-character limit, suffix included
+    assert len(var1) <= 32
+    assert len(var2) <= 32
+
+    # They should be different
+    assert var1 != var2
+
+    # The colliding second name is made unique with a "_1" suffix
+    assert var2.endswith("_1")
+
+
+def test_variable_name_uniqueness_with_short_names() -> None:
+    """Test that repeated short names get sequential numeric suffixes."""
+    writer = SurveySolutionsWriter()
+
+    # Request the same source name three times from one writer
+    var1 = writer.generate_variable_name("test")
+    var2 = writer.generate_variable_name("test")
+    var3 = writer.generate_variable_name("test")
+
+    # First use keeps the bare name; each repeat gets the next _N suffix
+    assert var1 == "test"
+    assert var2 == "test_1"
+    assert var3 == "test_2"
+
+
+def test_variable_name_uniqueness_reset_between_questionnaires() -> None:
+    """Test that one writer reused for two write() calls does not carry used names over."""
+    writer = SurveySolutionsWriter()
+
+    # First questionnaire: a single section holding one question with id "test"
+    questions1: list[Question] = [
+        TextQuestion(
+            number="1",
+            id="test",
+            text="Test question",
+            type=QuestionType.TEXT,
+            instructions=None,
+            universe=None,
+            max_length=None,
+        )
+    ]
+
+    section1 = Section(
+        id="section1",
+        number="A",
+        title="Section 1",
+        description="Test section",
+        questions=questions1,
+        occurrences=1,
+        universe=None,
+    )
+
+    questionnaire1 = Questionnaire(
+        title="Test 1",
+        description="First test questionnaire",
+        id_fields=["test"],
+        sections=[section1],
+        trailing_sections=[],
+    )
+
+    # Reserve an output path; delete=False keeps the file once the handle is closed
+    with tempfile.NamedTemporaryFile(suffix=".zip", delete=False) as temp_file:
+        temp_path = Path(temp_file.name)
+
+    try:
+        writer.write(questionnaire1, temp_path)
+
+        # Second questionnaire reuses the same question id, so it yields the same base name
+        questions2: list[Question] = [
+            TextQuestion(
+                number="1",
+                id="test",
+                text="Test question",
+                type=QuestionType.TEXT,
+                instructions=None,
+                universe=None,
+                max_length=None,
+            )
+        ]
+
+        section2 = Section(
+            id="section1",
+            number="A",
+            title="Section 1",
+            description="Test section",
+            questions=questions2,
+            occurrences=1,
+            universe=None,
+        )
+
+        questionnaire2 = Questionnaire(
+            title="Test 2",
+            description="Second test questionnaire",
+            id_fields=["test"],
+            sections=[section2],
+            trailing_sections=[],
+        )
+
+        # Write the second questionnaire with the SAME writer instance to a second path
+        with tempfile.NamedTemporaryFile(suffix=".zip", delete=False) as temp_file2:
+            temp_path2 = Path(temp_file2.name)
+
+        try:
+            writer.write(questionnaire2, temp_path2)
+
+            # If tracking were not reset, the second archive would contain "test_1"
+            # Read document.json from both archives to inspect the emitted names
+            with zipfile.ZipFile(temp_path, "r") as zip_file:
+                content1 = json.loads(zip_file.read("document.json"))
+
+            with zipfile.ZipFile(temp_path2, "r") as zip_file:
+                content2 = json.loads(zip_file.read("document.json"))
+
+            # First child of the first top-level child is the lone question in each document
+            var_name1 = content1["Children"][0]["Children"][0]["VariableName"]
+            var_name2 = content2["Children"][0]["Children"][0]["VariableName"]
+            assert var_name1 == "test"
+            assert var_name2 == "test"
+
+        finally:
+            temp_path2.unlink(missing_ok=True)
+    finally:
+        temp_path.unlink(missing_ok=True)