Merge pull request #59 from mabel-dev/v0.0.228

joocer · web-flow · commit 5a6b7b63ed69 · 2025-10-19T23:12:53.000+01:00
0.0.228
diff --git a/orso/dataframe.py b/orso/dataframe.py
@@ -342,14 +342,16 @@ def display(
     ) -> str:
         from .display import ascii_table
 
-        return ascii_table(
+        table_output, displayed_row_count = ascii_table(
             self,
             limit=limit,
             display_width=display_width,
             max_column_width=max_column_width,
             colorize=colorize,
             show_types=show_types,
+            return_row_count=True,
         )
+        return table_output + f"\n[ {displayed_row_count} rows x {self.columncount} columns ]"
 
     def markdown(self, limit: int = 5, max_column_width: int = 30) -> str:
         from .display import markdown
diff --git a/orso/display.py b/orso/display.py
@@ -297,13 +297,25 @@ def position(value, width, left=True):
                 parts.append("0s")
             value = f"\001INTERVALm{' '.join(parts)}\001OFFm"
             return trunc_printable(value, width)
+        if isinstance(value, (list, tuple)):
             # Check if this is an interval represented as [days, microseconds]
-            if (
+            # This could be:
+            # 1. Explicitly typed as INTERVAL
+            # 2. An ARRAY<INTEGER> with exactly 2 elements that might be an interval
+            is_potential_interval = False
+
+            if type_ and "INTERVAL" in str(type_):
+                is_potential_interval = True
+            elif (
                 type_
-                and "INTERVAL" in str(type_)
+                and "ARRAY<INTEGER>" in str(type_)
                 and len(value) == 2
                 and all(isinstance(v, (int, str)) for v in value)
             ):
+                # Heuristic: ARRAY<INTEGER> with 2 elements might be [days, microseconds]
+                is_potential_interval = True
+
+            if is_potential_interval:
                 try:
                     days = int(str(value[0]))  # Handle both int and string values
                     microseconds = int(str(value[1]))
@@ -328,7 +340,7 @@ def position(value, width, left=True):
 
                     formatted_interval = f"\001INTERVALm{' '.join(parts)}\001OFFm"
                     return trunc_printable(formatted_interval, width)
-                except (ValueError, TypeError) as e:
+                except (ValueError, TypeError):
                     # Fall back to regular list formatting if conversion fails
                     pass
 
@@ -465,11 +477,11 @@ def _inner():
         else:
             for i, row in enumerate(t):
                 displayed_rows += 1
-                if top_and_tail and (table.rowcount > 2 * limit):
-                    if i == limit:
-                        yield "\001PUNCm...\001OFFm"
-                    if i >= limit:
-                        i += t.rowcount - (2 * limit)
+
+                # Handle top_and_tail display
+                if top_and_tail and (table.rowcount > 2 * limit) and i == limit:
+                    yield "\001PUNCm...\001OFFm"
+
                 formatted = [type_formatter(v, w, t) for v, w, t in zip(row, col_width, col_types)]
                 yield (
                     "│\001TYPEm"
diff --git a/orso/types.py b/orso/types.py
@@ -66,6 +66,60 @@ def _parse_type(type_str: str) -> Union[str, Tuple[str, Tuple[int, ...]]]:
     return type_str.upper()
 
 
+def get_orso_type(type_str: str) -> "OrsoTypes":
+    """
+    Convert a type string to an OrsoType enum value with full type information.
+
+    This function parses a type string and returns an OrsoType enum value with
+    all relevant attributes set (precision, scale, length, element types).
+
+    Parameters:
+        type_str (str): The type definition string (e.g., 'INTEGER', 'ARRAY<INTEGER>', 'DECIMAL(10,2)').
+
+    Returns:
+        OrsoTypes: The corresponding OrsoType enum value with all attributes properly set.
+
+    Raises:
+        ValueError: If the type string is not recognized.
+
+    Examples:
+        >>> t = get_orso_type("INTEGER")
+        >>> t == OrsoTypes.INTEGER
+        True
+
+        >>> t = get_orso_type("DECIMAL(10,2)")
+        >>> t._precision
+        10
+        >>> t._scale
+        2
+
+        >>> t = get_orso_type("VARCHAR[255]")
+        >>> t._length
+        255
+
+        >>> t = get_orso_type("ARRAY<INTEGER>")
+        >>> t._element_type == OrsoTypes.INTEGER
+        True
+    """
+    if not type_str:
+        raise ValueError("Type string cannot be empty")
+
+    # Use the existing from_name method which handles all type attributes
+    _type, _length, _precision, _scale, _element_type = OrsoTypes.from_name(type_str)
+
+    if _type == 0 or _type is None:
+        raise ValueError(f"Unknown type '{type_str}'")
+
+    # Attach all the metadata to the returned type instance
+    # The __init__ method initializes these as None, so we just update them
+    object.__setattr__(_type, "_length", _length)
+    object.__setattr__(_type, "_precision", _precision)
+    object.__setattr__(_type, "_scale", _scale)
+    object.__setattr__(_type, "_element_type", _element_type)
+
+    return _type
+
+
 class OrsoTypes(str, Enum):
     """
     The names of the types supported by Orso
diff --git a/orso/version.py b/orso/version.py
@@ -10,5 +10,5 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-__version__: str = "0.0.228-beta.1"
+__version__: str = "0.0.228"
 __author__: str = "@joocer"
diff --git a/tests/test_display.py b/tests/test_display.py
@@ -1,25 +1,27 @@
 import os
 import sys
+import datetime as dt
 
 sys.path.insert(1, os.path.join(sys.path[0], ".."))
 
 from orso.dataframe import DataFrame
+from orso.types import get_orso_type, OrsoTypes
 from tests import cities
 import re
 from typing import List
 
 lengths = {
-    0: 5,
-    1: 6,
-    2: 7,
-    3: 8,
-    4: 9,
-    5: 10,
-    6: 11,
-    7: 12,
-    8: 12,
-    9: 12,
-    10: 12,
+    0: 6,  # Updated: now includes footer line
+    1: 7,
+    2: 8,
+    3: 9,
+    4: 10,
+    5: 11,
+    6: 12,
+    7: 13,
+    8: 13,
+    9: 13,
+    10: 13,
 }
 
 
@@ -56,10 +58,177 @@ def test_display_ascii_greedy():
         df = DataFrame(cities.values).head(i)
         df.materialize()
 
-        ascii = df.display(limit=3, show_types=True)
-
-        assert len(ascii.split("\n")) == lengths[i], i
-        assert len(find_all_substrings(ascii, "Tokyo")) == (1 if i != 0 else 0)
+        ascii_output = df.display(limit=3, show_types=True)
+
+        assert len(ascii_output.split("\n")) == lengths[i], i
+        assert len(find_all_substrings(ascii_output, "Tokyo")) == (1 if i != 0 else 0)
+
+
+def test_row_count_footer_single_row():
+    """Test that row count footer is accurate for a single row DataFrame"""
+    df = DataFrame([{"a": 1, "b": 2}])
+    output = df.display()
+    # Should show "[ 1 rows x 2 columns ]" in the footer
+    assert "[ 1 rows x 2 columns ]" in output
+
+
+def test_row_count_footer_multiple_rows():
+    """Test that row count footer is accurate for multiple rows"""
+    data = [{"a": i, "b": i * 2} for i in range(10)]
+    df = DataFrame(data)
+    output = df.display()
+    # Should show "[ 10 rows x 2 columns ]" in the footer
+    assert "[ 10 rows x 2 columns ]" in output
+
+
+def test_row_count_footer_lazy_dataframe():
+    """Test that row count footer is accurate for lazy (generator-based) DataFrames"""
+    data = [{"a": i, "b": i * 2} for i in range(50)]
+    df = DataFrame(data)
+    output = df.display(limit=5)
+    # With top_and_tail enabled, display shows 5 head + 5 tail = 10 rows
+    # So the footer should show [ 10 rows x 2 columns ] for the displayed subset
+    # NOT the original 50 rows  
+    assert "[ 10 rows x 2 columns ]" in output
+
+
+def test_row_indices_consistency():
+    """Test that row indices are consistent and sequential"""
+    data = [{"a": i, "b": i * 2} for i in range(20)]
+    df = DataFrame(data)
+    output = df.display(limit=5)
+    
+    # Extract row indices from the display
+    # Remove ANSI color codes for easier parsing
+    ansi_escape = re.compile(r'\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])')
+    clean_output = ansi_escape.sub('', output)
+    
+    lines = clean_output.split("\n")
+    row_indices = []
+    for line in lines:
+        # Look for lines with │ that contain data rows (not header/footer)
+        if "│" in line and "─" not in line and "═" not in line and "┌" not in line:
+            # Try to extract the first number after the first │
+            parts = line.split("│")
+            if len(parts) > 1:
+                try:
+                    idx = int(parts[1].strip())
+                    row_indices.append(idx)
+                except (ValueError, IndexError):
+                    pass
+    
+    # With top_and_tail, should display head (1-5) + tail (6-10)
+    # Note: The tail rows show indices 6-10 from enumeration, not the actual row numbers
+    assert len(row_indices) >= 10, f"Expected at least 10 row indices, got {len(row_indices)}"
+    assert row_indices[:5] == [1, 2, 3, 4, 5], f"Expected [1,2,3,4,5], got {row_indices[:5]}"
+    # The last 5 should be 6-10 (tail enumeration)
+    assert row_indices[-5:] == [6, 7, 8, 9, 10], f"Expected [6,7,8,9,10], got {row_indices[-5:]}"
+
+
+def test_interval_formatting_from_array():
+    """Test that intervals represented as [days, microseconds] arrays are handled"""
+    # Create a DataFrame with interval-like data
+    # [0, 36000000000] microseconds = 10 hours
+    data = [{"interval": ["0", "36000000000"]}]
+    df = DataFrame(data)
+    output = df.display()
+    
+    # The interval heuristic checks for ARRAY<INTEGER> with 2 elements,
+    # but raw data will default to unknown types. The display should still work,
+    # it just may not format as an interval.
+    # Check that the display includes the interval column
+    assert "interval" in output
+
+
+def test_get_orso_type_parser_simple_types():
+    """Test the get_orso_type parser with simple type strings"""
+    assert get_orso_type("INTEGER") == OrsoTypes.INTEGER
+    assert get_orso_type("VARCHAR") == OrsoTypes.VARCHAR
+    assert get_orso_type("DOUBLE") == OrsoTypes.DOUBLE
+    assert get_orso_type("BOOLEAN") == OrsoTypes.BOOLEAN
+    assert get_orso_type("DATE") == OrsoTypes.DATE
+    assert get_orso_type("TIMESTAMP") == OrsoTypes.TIMESTAMP
+    assert get_orso_type("INTERVAL") == OrsoTypes.INTERVAL
+
+
+def test_get_orso_type_parser_complex_types():
+    """Test the get_orso_type parser with complex type strings"""
+    assert get_orso_type("ARRAY<INTEGER>") == OrsoTypes.ARRAY
+    assert get_orso_type("ARRAY<VARCHAR>") == OrsoTypes.ARRAY
+    assert get_orso_type("VARCHAR[255]") == OrsoTypes.VARCHAR
+    assert get_orso_type("DECIMAL(10,2)") == OrsoTypes.DECIMAL
+    assert get_orso_type("BLOB[1024]") == OrsoTypes.BLOB
+
+
+def test_get_orso_type_parser_case_insensitive():
+    """Test that get_orso_type is case-insensitive"""
+    assert get_orso_type("integer") == OrsoTypes.INTEGER
+    assert get_orso_type("INTEGER") == OrsoTypes.INTEGER
+    assert get_orso_type("InTeGeR") == OrsoTypes.INTEGER
+    assert get_orso_type("array<integer>") == OrsoTypes.ARRAY
+    assert get_orso_type("ARRAY<INTEGER>") == OrsoTypes.ARRAY
+
+
+def test_get_orso_type_parser_invalid_type():
+    """Test that get_orso_type raises ValueError for invalid types"""
+    try:
+        get_orso_type("INVALID_TYPE")
+        assert False, "Should have raised ValueError"
+    except ValueError as e:
+        assert "Unknown" in str(e)
+
+
+def test_display_with_mixed_data_types():
+    """Test display with mixed data types to ensure no regressions"""
+    data = [
+        {
+            "int_col": 42,
+            "float_col": 3.14,
+            "str_col": "hello",
+            "bool_col": True,
+            "date_col": dt.date(2025, 10, 19),
+        }
+    ]
+    df = DataFrame(data)
+    output = df.display()
+    
+    # All columns should be present in output
+    assert "int_col" in output
+    assert "float_col" in output
+    assert "str_col" in output
+    assert "bool_col" in output
+    assert "date_col" in output
+    assert "42" in output
+    assert "3.14" in output
+    assert "hello" in output
+    assert "2025-10-19" in output
+
+
+def test_display_with_null_values():
+    """Test that null values are displayed correctly"""
+    data = [
+        {"a": 1, "b": None},
+        {"a": None, "b": 2},
+    ]
+    df = DataFrame(data)
+    output = df.display()
+    
+    # Should show "null" for None values
+    assert "null" in output
+    assert "2 rows" in output
+
+
+def test_display_footer_rows_columns_format():
+    """Test that the footer format is consistent"""
+    data = [{"x": i, "y": i * 2, "z": i * 3} for i in range(5)]
+    df = DataFrame(data)
+    output = df.display()
+    
+    # Should end with footer in format "[ N rows x M columns ]"
+    assert "[ 5 rows x 3 columns ]" in output
+    # Verify it's in the last line
+    last_line = output.split("\n")[-1]
+    assert "[ 5 rows x 3 columns ]" in last_line
 
 
 
diff --git a/tests/test_types_parser.py b/tests/test_types_parser.py