Merge pull request #47 from davidanthoff/remove-dataarray-dataframe

davidanthoff · web-flow · commit 731657cca602 · 2018-04-29T17:54:11.000-07:00
Remove dependency on DataArray and DataFrame
diff --git a/LICENSE.md b/LICENSE.md
@@ -1,6 +1,6 @@
 The ExcelReaders.jl package is licensed under the MIT "Expat" License:
 
-> Copyright (c) 2016: David Anthoff.
+> Copyright (c) 2016-2018: David Anthoff.
 >
 > Permission is hereby granted, free of charge, to any person obtaining
 > a copy of this software and associated documentation files (the
diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,9 @@
+# ExcelReaders.jl v0.9.0 Release Notes
+* Drop support for DataFrames.
+* Use Dates.Time.
+* Use DataValue for missing values.
+* Fix deprecated syntax.
+
 # ExcelReaders.jl v0.8.2 Release Notes
 * Fix bug in readxlsheet
 
diff --git a/README.md b/README.md
@@ -11,6 +11,14 @@
 
 ExcelReaders is a package that provides functionality to read Excel files.
 
+**WARNING**: Version v0.9.0 removed all support for [DataFrames.jl](https://github.com/JuliaData/DataFrames.jl)
+from this package. The [ExcelFiles.jl](https://github.com/davidanthoff/ExcelFiles.jl)
+package now provides functionality to read data from an Excel file into
+a ``DataFrame`` (or any other table type), and users are encouraged to use
+that package for tabular data going forward. Version v0.9.0 also no longer
+uses [DataArrays.jl](https://github.com/JuliaStats/DataArrays.jl), but instead
+is based on [DataValues.jl](https://github.com/davidanthoff/DataValues.jl).
+
 ## Installation
 
 Use ``Pkg.add("ExcelReaders")`` in Julia to install ExcelReaders and its dependencies.
@@ -31,7 +39,7 @@ using ExcelReaders
 data = readxl("Filename.xlsx", "Sheet1!A1:C4")
 ````
 
-This will return a ``DataMatrix{Any}`` with all the data in the cell range A1 to C4 on Sheet1 in the Excel file Filename.xlsx.
+This will return an array with all the data in the cell range A1 to C4 on Sheet1 in the Excel file Filename.xlsx.
 
 If you expect to read multiple ranges from the same Excel file you can get much better performance by opening the Excel file only once:
 
@@ -62,37 +70,3 @@ This will read all content on Sheet1 in the file Filename.xlsx. Eventual blank r
 - ``ncols`` accepts either ``:all`` (default) or a positive integer. With ``:all``, all columns (except skipped ones) are read. An integer specifies the exact number of columns to be read.
 
 ``readxlsheet`` also accepts an ExcelFile (as obtained from ``openxl``) as its first argument.
-
-## Reading into a DataFrame
-
-To read into a DataFrame:
-
-````julia
-using ExcelReaders
-using DataFrames
-
-df = readxl(DataFrame, "Filename.xlsx", "Sheet1!A1:C4")
-````
-
-This code will use the first row in the range A1:C4 as the column names in the DataFrame.
-
-To read in data without a header row use
-
-````julia
-df = readxl(DataFrame, "Filename.xlsx", "Sheet1!A1:C4", header=false)
-````
-
-This will auto-generate column names. Alternatively you can specify your own names:
-
-````julia
-df = readxl(DataFrame, "Filename.xlsx", "Sheet1!A1:C4",
-            header=false, colnames=[:name1, :name2, :name3])
-````
-
-You can also combine ``header=true`` and a custom ``colnames`` list, in that case the first row in the specified range will just be skipped.
-
-To read the whole sheet into a DataFrame (respective keyword arguments (`header`, `skipstartrows` etc.) should work as expected):
-
-```julia
-df = readxlsheet(DataFrame, "Filename.xlsx", "Sheet1")
-```
diff --git a/REQUIRE b/REQUIRE
@@ -1,4 +1,3 @@
 julia 0.6
-DataArrays
-DataFrames
+DataValues
 PyCall 1.5
diff --git a/src/ExcelReaders.jl b/src/ExcelReaders.jl
@@ -2,9 +2,7 @@ __precompile__()
 
 module ExcelReaders
 
-using PyCall, DataArrays, DataFrames
-
-import Base.show
+using PyCall, DataValues
 
 export openxl, readxl, readxlsheet, ExcelErrorCell, ExcelFile, readxlnames, readxlrange
 
@@ -23,7 +21,7 @@ A handle to an open Excel file.
 
 You can create an instance of an ``ExcelFile`` by calling ``openxl``.
 """
-type ExcelFile
+mutable struct ExcelFile
     workbook::PyObject
     filename::AbstractString
 end
@@ -36,25 +34,21 @@ An Excel cell that has an Excel error.
 You cannot create ``ExcelErrorCell`` objects, they are returned if a cell in an
 Excel file has an Excel error.
 """
-type ExcelErrorCell
+mutable struct ExcelErrorCell
     errorcode::Int
 end
 
-# TODO Remove this type once there is a Time type in Dates
-immutable Time
-    hours::Int
-    minutes::Int
-    seconds::Int
-end
-
-function show(io::IO, o::ExcelFile)
+function Base.show(io::IO, o::ExcelFile)
     print(io, "ExcelFile <$(o.filename)>")
 end
 
-function show(io::IO, o::ExcelErrorCell)
+function Base.show(io::IO, o::ExcelErrorCell)
     print(io, xlrd[:error_text_from_code][o.errorcode])
 end
 
+Base.promote_rule(::Type{DataValue{T}}, ::Type{ExcelErrorCell}) where {T}= Any
+Base.promote_rule(::Type{ExcelErrorCell}, ::Type{DataValue{T}}) where {T} = Any
+
 """
     openxl(filename)
 
@@ -219,7 +213,7 @@ function get_cell_value(ws, row, col, wb)
         elseif celltype == xlrd[:XL_CELL_DATE]
             date_year,date_month,date_day,date_hour,date_minute,date_sec = xlrd[:xldate_as_tuple](cellval, wb[:datemode])
             if date_month==0
-                return Time(date_hour, date_minute, date_sec)
+                return Base.Dates.Time(date_hour, date_minute, date_sec)
             else
                 return DateTime(date_year, date_month, date_day, date_hour, date_minute, date_sec)
             end
@@ -241,7 +235,7 @@ function readxl_internal(file::ExcelFile, sheetname::AbstractString, startrow::I
         return get_cell_value(ws, startrow, startcol, wb)
     else
 
-        data = DataArray(Any, endrow-startrow+1,endcol-startcol+1)
+        data = Array{Any}(endrow-startrow+1,endcol-startcol+1)
 
         for row in startrow:endrow
             for col in startcol:endcol
@@ -253,119 +247,6 @@ function readxl_internal(file::ExcelFile, sheetname::AbstractString, startrow::I
     end
 end
 
-function readxl(::Type{DataFrame}, filename::AbstractString, range::AbstractString; header::Bool=true, colnames::Vector{Symbol}=Symbol[])
-    excelfile = openxl(filename)
-
-    readxl(DataFrame, excelfile, range, header=header, colnames=colnames)
-end
-
-function readxl(::Type{DataFrame}, file::ExcelFile, range::AbstractString; header::Bool=true, colnames::Vector{Symbol}=Symbol[])
-    sheetname, startrow, startcol, endrow, endcol = convert_ref_to_sheet_row_col(range)
-
-    readxl_internal(DataFrame, file, sheetname, startrow, startcol, endrow, endcol, header=header, colnames=colnames)
-end
-
-function readxlsheet(::Type{DataFrame}, filename::AbstractString, sheetindex::Int; header::Bool=true, colnames::Vector{Symbol}=Symbol[], args...)
-    excelfile = openxl(filename)
-    readxlsheet(DataFrame, excelfile, sheetindex; args...)
-end
-
-function readxlsheet(::Type{DataFrame}, excelfile::ExcelFile, sheetindex::Int; header::Bool=true, colnames::Vector{Symbol}=Symbol[], args...)
-    sheetname = excelfile.workbook[:sheet_names]()[sheetindex]
-    readxlsheet(DataFrame, excelfile, sheetname; args...)
-end
-
-function readxlsheet(::Type{DataFrame}, filename::AbstractString, sheetname::AbstractString; header::Bool=true, colnames::Vector{Symbol}=Symbol[], args...)
-    excelfile = openxl(filename)
-    readxlsheet(DataFrame, excelfile, sheetname; header=header, colnames=colnames, args...)
-end
-
-function readxlsheet(::Type{DataFrame}, excelfile::ExcelFile, sheetname::AbstractString; header::Bool=true, colnames::Vector{Symbol}=Symbol[], args...)
-    sheet = excelfile.workbook[:sheet_by_name](sheetname)
-    startrow, startcol, endrow, endcol = convert_args_to_row_col(sheet; args...)
-    readxl_internal(DataFrame, excelfile, sheetname, startrow, startcol, endrow, endcol; header=header, colnames=colnames)
-end
-
-function readxl_internal(::Type{DataFrame}, file::ExcelFile, sheetname::AbstractString, startrow::Int, startcol::Int, endrow::Int, endcol::Int; header::Bool=true, colnames::Vector{Symbol}=Symbol[])
-    data = readxl_internal(file, sheetname, startrow, startcol, endrow, endcol)
-
-    nrow, ncol = size(data)
-
-    if length(colnames)==0
-        if header
-            headervec = data[1, :]
-            NAcol = Bool.(isna.(headervec))
-            headervec[NAcol] = DataFrames.gennames(countnz(NAcol))
-
-            # This somewhat complicated conditional makes sure that column names
-            # that are integer numbers end up without an extra ".0" as their name
-            colnames = [isa(i, AbstractFloat) ? ( modf(i)[1]==0.0 ? Symbol(Int(i)) : Symbol(string(i)) ) : Symbol(i) for i in vec(headervec)]
-        else
-            colnames = DataFrames.gennames(ncol)
-        end
-    elseif length(colnames)!=ncol
-        error("Length of colnames must equal number of columns in selected range")
-    end
-
-    columns = Array{Any}(ncol)
-
-    for i=1:ncol
-        if header
-            vals = data[2:end,i]
-        else
-            vals = data[:,i]
-        end
-
-        # Check whether all non-NA values in this column
-        # are of the same type
-        all_one_type = true
-        found_first_type = false
-        type_of_el = Any
-        NAs_present = false
-        for val=vals
-            if !found_first_type
-                if !isna(val)
-                    type_of_el = typeof(val)
-                    found_first_type = true
-                end
-            elseif !isna(val) && (typeof(val)!=type_of_el)
-                all_one_type = false
-                if NAs_present
-                    break
-                end
-            end
-            if isna(val)
-                NAs_present = true
-                if all_one_type == false
-                    break
-                end
-            end
-        end
-
-        if all_one_type
-            if NAs_present
-                # TODO use the following line instead of the shim once upstream
-                # bug is fixed
-                #columns[i] = convert(DataArray{type_of_el},vals)
-                shim_newarray = DataArray(type_of_el, length(vals))
-                for l=1:length(vals)
-                    shim_newarray[l] = vals[l]
-                end
-                columns[i] = shim_newarray
-            else
-                # TODO Decide whether this should be converted to Array instead of DataArray
-                columns[i] = convert(DataArray{type_of_el},vals)
-            end
-        else
-            columns[i] = vals
-        end
-    end
-
-    df = DataFrame(columns, colnames)
-
-    return df
-end
-
 function readxlnames(f::ExcelFile)
     return [lowercase(i[:name]) for i in f.workbook[:name_obj_list] if i[:hidden]==0]
 end
diff --git a/src/package_documentation.jl b/src/package_documentation.jl
@@ -33,7 +33,7 @@ using ExcelReaders
 data = readxl("Filename.xlsx", "Sheet1!A1:C4")
 ````
 
-This will return a ``DataMatrix{Any}`` with all the data in the cell range A1 to
+This will return an array with all the data in the cell range A1 to
 C4 on Sheet1 in the Excel file Filename.xlsx.
 
 If you expect to read multiple ranges from the same Excel file you can get much
@@ -74,40 +74,5 @@ all columns (except skipped ones) are read. An integer specifies the exact numbe
 
 ``readxlsheet`` also accepts an ``ExcelFile`` (as obtained from ``openxl``) as its
 first argument.
-
-## Reading into a DataFrame
-
-To read into a DataFrame:
-
-````julia
-using ExcelReaders, DataFrames
-df = readxl(DataFrame, "Filename.xlsx", "Sheet1!A1:C4")
-````
-
-This code will use the first row in the range A1:C4 as the column names in the
-DataFrame.
-
-To read in data without a header row use
-
-````julia
-df = readxl(DataFrame, "Filename.xlsx", "Sheet1!A1:C4", header=false)
-````
-
-This will auto-generate column names. Alternatively you can specify your own names:
-
-````julia
-df = readxl(DataFrame, "Filename.xlsx", "Sheet1!A1:C4",
-            header=false, colnames=[:name1, :name2, :name3])
-````
-
-You can also combine ``header=true`` and a custom ``colnames`` list, in that
-case the first row in the specified range will just be skipped.
-
-To read the whole sheet into a DataFrame (respective keyword arguments (header, skipstartrows etc.)
-should work as expected):
-
-````julia
-df = readxlsheet(DataFrame, "Filename.xlsx", "Sheet1")
-````
 """
 tutorial = nothing
diff --git a/test/runtests.jl b/test/runtests.jl

Original file line number	Diff line number	Diff line change
`@@ -1,6 +1,6 @@`
`1`	`1`	`The ExcelReaders.jl package is licensed under the MIT "Expat" License:`
`2`	`2`
`3`		`-> Copyright (c) 2016: David Anthoff.`
	`3`	`+> Copyright (c) 2016-2018: David Anthoff.`
`4`	`4`	`>`
`5`	`5`	`> Permission is hereby granted, free of charge, to any person obtaining`
`6`	`6`	`> a copy of this software and associated documentation files (the`