Skip to content

Commit bc5cd91

Browse files
committed
CSPro unique rec type, SurveySolutions unique var name
1 parent 4315a0c commit bc5cd91

File tree

6 files changed

+265
-4
lines changed

6 files changed

+265
-4
lines changed

pyproject.toml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -159,6 +159,11 @@ enableTypeIgnoreComments = true
159159
reportUnusedCallResult = false
160160
reportAny = false
161161

162+
# Disable private usage reporting for test files
163+
[[tool.basedpyright.executionEnvironments]]
164+
root = "tests"
165+
reportPrivateUsage = false
166+
162167
[tool.codespell]
163168
# Add here as needed:
164169
ignore-words-list = "popstan"

src/survaize/surveysolutions/surveysolutions_writer.py

Whitespace-only changes.

src/survaize/writer/cspro_writer.py

Lines changed: 44 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -131,7 +131,10 @@ def _generate_data_dictionary(self, questionnaire: Questionnaire) -> tuple[CSPro
131131
id_items.append(id_item)
132132

133133
# Create records for each section (excluding empty sections)
134+
# Record types are assigned sequentially (A, B, C, ..., Z, AA, AB, AC, ...)
135+
# to ensure uniqueness regardless of section numbering scheme
134136
records: list[DictionaryRecord] = []
137+
record_type_counter = 0 # Counter for generating unique record types
135138
for section in questionnaire.sections:
136139
# Create a record for this section with non-ID questions
137140
record_items: list[DictionaryItem] = []
@@ -153,10 +156,14 @@ def _generate_data_dictionary(self, questionnaire: Questionnaire) -> tuple[CSPro
153156

154157
# Only create record if it has items
155158
if record_items:
159+
# Generate unique record type: A, B, C, ..., Z, then AA, AB, etc.
160+
record_type = self._generate_record_type(record_type_counter)
161+
record_type_counter += 1
162+
156163
record = DictionaryRecord(
157164
name=self._to_dictionary_name(f"{section.id}_REC"),
158165
labels=[DictionaryLabel(text=self._to_dictionary_label(section.number, section.id))],
159-
recordType=section.number[0], # Use first letter of section number as record type
166+
recordType=record_type,
160167
items=record_items,
161168
occurrences=DictionaryRecordOccurrences(required=False, maximum=section.occurrences),
162169
)
@@ -808,3 +815,39 @@ def _replace_suffix(self, string: str, old_suffix: str, new_suffix: str) -> str:
808815
if string.endswith(old_suffix):
809816
return string[: -len(old_suffix)] + new_suffix
810817
return string
818+
819+
def _generate_record_type(self, index: int) -> str:
    """Return the unique, letters-only record type for a zero-based index.

    Record types follow a spreadsheet-column style sequence:
        A, B, C, ..., Z          (indices 0-25)
        AA, AB, AC, ..., AZ      (indices 26-51)
        BA, BB, BC, ..., BZ      (indices 52-77)
        ...
        ZA, ZB, ZC, ..., ZZ      (indices 676-701)
        AAA, AAB, ...            (indices 702 and up)

    Every index maps to a distinct string made only of the letters A-Z, and
    the string is as short as the sequence allows.

    Args:
        index: Zero-based position of the record; must not be negative.

    Returns:
        Record type string: 1 character for indices 0-25, 2 characters for
        indices 26-701, and 3 or more characters beyond that.

    Raises:
        ValueError: If ``index`` is negative.
    """
    if index < 0:
        raise ValueError(f"Record type index must be non-negative, got {index}")

    # Bijective base-26: there is no "zero" digit, so work with a one-based
    # value and subtract one before every division. This is what makes "Z"
    # roll over to "AA" instead of "BA".
    letters: list[str] = []
    remaining = index + 1
    while remaining > 0:
        remaining, digit = divmod(remaining - 1, 26)
        letters.append(chr(ord("A") + digit))

    # Digits were produced least-significant first.
    return "".join(reversed(letters))

src/survaize/writer/surveysolutions_writer.py

Lines changed: 27 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,10 @@
4343
class SurveySolutionsWriter(Writer):
4444
"""Writer that converts Survaize questionnaires to Survey Solutions JSON format."""
4545

46+
def __init__(self) -> None:
    """Set up the writer with an empty registry of already-used variable names."""
    # Names recorded here are the ones generate_variable_name has handed out;
    # it consults the set to avoid returning the same name twice.
    names_in_use: set[str] = set()
    self._used_variable_names = names_in_use
49+
4650
@override
4751
def write(self, questionnaire: Questionnaire, output_path: Path) -> None:
4852
"""Write a questionnaire to Survey Solutions backup format (zip file).
@@ -53,6 +57,9 @@ def write(self, questionnaire: Questionnaire, output_path: Path) -> None:
5357
"""
5458
logger.info(f"Converting questionnaire '{questionnaire.title}' to Survey Solutions format")
5559

60+
# Reset used variable names for this conversion
61+
self._used_variable_names.clear()
62+
5663
# Convert Survaize questionnaire to Survey Solutions format
5764
ss_questionnaire = self._convert_questionnaire(questionnaire)
5865

@@ -189,18 +196,19 @@ def _convert_question(self, question: Question) -> QuestionElement:
189196
)
190197

191198
def generate_variable_name(self, name: str) -> str:
192-
"""Generate a valid Survey Solutions variable name from a string.
199+
"""Generate a valid and unique Survey Solutions variable name from a string.
193200
194201
Survey Solutions variable names must:
195202
- Start with a letter
196203
- Contain only letters, numbers, and underscores
197204
- Be no longer than 32 characters
205+
- Be unique within the questionnaire
198206
199207
Args:
200208
name: The source name
201209
202210
Returns:
203-
A valid variable name
211+
A valid and unique variable name
204212
"""
205213
# Remove special characters and replace spaces with underscores
206214
variable_name = re.sub(r"[^a-zA-Z0-9_]", "_", name)
@@ -213,11 +221,27 @@ def generate_variable_name(self, name: str) -> str:
213221
if not variable_name:
214222
variable_name = "Variable"
215223

216-
# Truncate to 32 characters
224+
# Truncate to 32 characters initially
217225
if len(variable_name) > 32:
218226
variable_name = variable_name[:32]
219227

220228
# Remove trailing underscores
221229
variable_name = variable_name.rstrip("_")
222230

231+
# Ensure uniqueness by adding a suffix if needed
232+
original_name = variable_name
233+
counter = 1
234+
while variable_name in self._used_variable_names:
235+
# Calculate how much space we need for the suffix
236+
suffix = f"_{counter}"
237+
max_base_length = 32 - len(suffix)
238+
239+
# Truncate the base name if needed to make room for suffix
240+
base_name = original_name[:max_base_length].rstrip("_")
241+
variable_name = base_name + suffix
242+
counter += 1
243+
244+
# Add to used names set
245+
self._used_variable_names.add(variable_name)
246+
223247
return variable_name

tests/test_cspro_writer.py

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,3 +45,55 @@ def test_cspro_writer_generates_expected_files(tmp_path: Path) -> None:
4545
f"Unexpected generated files: {sorted(generated_files - fixture_files)}; "
4646
f"missing files: {sorted(fixture_files - generated_files)}"
4747
)
48+
49+
50+
def test_generate_record_type() -> None:
    """Check that _generate_record_type yields the expected, collision-free record types."""
    writer = CSProWriter()
    alphabet = [chr(ord("A") + offset) for offset in range(26)]

    # Indices 0-25 map onto the single letters A-Z.
    for i, expected in enumerate(alphabet):
        actual = writer._generate_record_type(i)
        assert actual == expected, f"Index {i}: expected {expected}, got {actual}"

    # Indices 26-51 map onto AA-AZ, and indices 52-77 onto BA-BZ.
    for first_char, start in (("A", 26), ("B", 52)):
        for i, second_char in enumerate(alphabet, start=start):
            expected = f"{first_char}{second_char}"
            actual = writer._generate_record_type(i)
            assert actual == expected, f"Index {i}: expected {expected}, got {actual}"

    # Hand-picked boundary values around each transition.
    edge_cases = {
        0: "A",  # first record
        25: "Z",  # last single-character type
        26: "AA",  # first two-character type
        51: "AZ",  # last A* type
        52: "BA",  # first B* type
        77: "BZ",  # last B* type
        78: "CA",  # first C* type
        100: "CW",  # 100-26=74 -> 74//26=2 (C), 74%26=22 (W)
        701: "ZZ",  # 701-26=675 -> 675//26=25 (Z), 675%26=25 (Z)
    }
    for index, expected in edge_cases.items():
        actual = writer._generate_record_type(index)
        assert actual == expected, f"Index {index}: expected {expected}, got {actual}"

    # Every index up to and including ZZ must produce a distinct type.
    record_types: set[str] = set()
    for i in range(702):
        record_type = writer._generate_record_type(i)
        assert record_type not in record_types, f"Duplicate record type {record_type} at index {i}"
        record_types.add(record_type)

    # 26 single-letter types plus 26*26 two-letter types.
    assert len(record_types) == 702

tests/test_surveysolutions_writer.py

Lines changed: 137 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,3 +144,140 @@ def test_variable_name_generation() -> None:
144144
# Test long name (should be truncated to 32 chars)
145145
long_name = "a" * 40
146146
assert len(writer.generate_variable_name(long_name)) == 32
147+
148+
149+
def test_variable_name_uniqueness() -> None:
    """Names that collide after truncation to 32 characters must still come out distinct."""
    writer = SurveySolutionsWriter()

    # Both sources share their first 35 characters, so plain truncation to
    # 32 characters would make them identical.
    shared_prefix = "a" * 35
    generated = [writer.generate_variable_name(shared_prefix + tail) for tail in ("xyz", "abc")]
    var1, var2 = generated

    # Neither result may exceed the 32-character limit.
    for candidate in generated:
        assert len(candidate) <= 32

    # The results differ, and the later one carries the disambiguating suffix.
    assert var1 != var2
    assert var2.endswith("_1")
169+
170+
171+
def test_variable_name_uniqueness_with_short_names() -> None:
    """Requesting the same short name repeatedly yields numbered variants."""
    writer = SurveySolutionsWriter()

    # Ask for the identical name three times in a row.
    generated = [writer.generate_variable_name("test") for _ in range(3)]

    # The first call keeps the bare name; later calls get _1, _2 appended.
    assert generated == ["test", "test_1", "test_2"]
184+
185+
186+
def test_variable_name_uniqueness_reset_between_questionnaires() -> None:
    """Writing two questionnaires with one writer must not leak used names between them."""
    writer = SurveySolutionsWriter()

    def build_questionnaire(title: str, description: str) -> Questionnaire:
        # One section holding a single text question whose id is "test".
        questions: list[Question] = [
            TextQuestion(
                number="1",
                id="test",
                text="Test question",
                type=QuestionType.TEXT,
                instructions=None,
                universe=None,
                max_length=None,
            )
        ]
        section = Section(
            id="section1",
            number="A",
            title="Section 1",
            description="Test section",
            questions=questions,
            occurrences=1,
            universe=None,
        )
        return Questionnaire(
            title=title,
            description=description,
            id_fields=["test"],
            sections=[section],
            trailing_sections=[],
        )

    def reserve_zip_path() -> Path:
        # delete=False keeps the file after the handle closes so the writer
        # can target it; the caller removes it afterwards.
        with tempfile.NamedTemporaryFile(suffix=".zip", delete=False) as handle:
            return Path(handle.name)

    def first_variable_name(archive: Path) -> str:
        # Read the variable name of the first question in the first section.
        with zipfile.ZipFile(archive, "r") as zip_file:
            document = json.loads(zip_file.read("document.json"))
        return document["Children"][0]["Children"][0]["VariableName"]

    temp_path = reserve_zip_path()
    try:
        # First questionnaire.
        writer.write(build_questionnaire("Test 1", "First test questionnaire"), temp_path)

        temp_path2 = reserve_zip_path()
        try:
            # Second questionnaire reuses the same question id.
            writer.write(build_questionnaire("Test 2", "Second test questionnaire"), temp_path2)

            # Tracking is reset per write, so neither output gets a suffix.
            var_name1 = first_variable_name(temp_path)
            var_name2 = first_variable_name(temp_path2)
            assert var_name1 == "test"
            assert var_name2 == "test"
        finally:
            temp_path2.unlink(missing_ok=True)
    finally:
        temp_path.unlink(missing_ok=True)

0 commit comments

Comments
 (0)