Commit ccc3dd6

format with scalafmt
1 parent ff91733 commit ccc3dd6

119 files changed: +6716 / -3502 lines


spark-csv/src/main/scala/io/frama/parisni/spark/csv/CSVTool.scala

Lines changed: 89 additions & 29 deletions
@@ -18,10 +18,29 @@ case class exportDf(file: String, content: String)
 
 object CSVTool extends LazyLogging {
 
-  def apply(spark: SparkSession, path: String, schema: StructType, delimiter: Option[String] = None, escape: Option[String] = None, multiline: Option[Boolean] = None, dateFormat: Option[String] = None, timestampFormat: Option[String] = None, removeNullRows: Option[String] = None, isCast: Boolean = true): Dataset[Row] = {
+  def apply(
+      spark: SparkSession,
+      path: String,
+      schema: StructType,
+      delimiter: Option[String] = None,
+      escape: Option[String] = None,
+      multiline: Option[Boolean] = None,
+      dateFormat: Option[String] = None,
+      timestampFormat: Option[String] = None,
+      removeNullRows: Option[String] = None,
+      isCast: Boolean = true
+  ): Dataset[Row] = {
     val mandatoryColumns = DFTool.getMandatoryColumns(schema)
     val optionalColumns = DFTool.getOptionalColumns(schema)
-    val df = read(spark: SparkSession, path: String, delimiter, escape, multiline, dateFormat, timestampFormat)
+    val df = read(
+      spark: SparkSession,
+      path: String,
+      delimiter,
+      escape,
+      multiline,
+      dateFormat,
+      timestampFormat
+    )
 
     DFTool.existColumns(df, mandatoryColumns)
     val dfWithCol = DFTool.addMissingColumns(df, optionalColumns)
@@ -37,8 +56,13 @@ object CSVTool extends LazyLogging {
     dfNull
   }
 
-  def write(df: DataFrame, path: String, mode: org.apache.spark.sql.SaveMode): Unit = {
-    df.write.format("csv")
+  def write(
+      df: DataFrame,
+      path: String,
+      mode: org.apache.spark.sql.SaveMode
+  ): Unit = {
+    df.write
+      .format("csv")
       .option("delimiter", ",")
       .option("header", value = false)
       .option("nullValue", null)
@@ -51,18 +75,30 @@ object CSVTool extends LazyLogging {
       .save(path)
   }
 
-  def read(spark: SparkSession, path: String, delimiter: Option[String] = None, escape: Option[String], multiline: Option[Boolean] = None, dateFormat: Option[String], timestampFormat: Option[String] = None): Dataset[Row] = {
+  def read(
+      spark: SparkSession,
+      path: String,
+      delimiter: Option[String] = None,
+      escape: Option[String],
+      multiline: Option[Boolean] = None,
+      dateFormat: Option[String],
+      timestampFormat: Option[String] = None
+  ): Dataset[Row] = {
     val headers = getCsvHeaders(spark, path, delimiter)
     val schemaSimple = getStringStructFromArray(headers)
     logger.info(schemaSimple.prettyJson)
-    val csvTmp = spark.read.format("csv")
+    val csvTmp = spark.read
+      .format("csv")
       .schema(schemaSimple)
       .option("multiline", multiline.getOrElse(false))
       .option("delimiter", delimiter.getOrElse(","))
       .option("header", value = true)
       .option("quote", "\"")
       .option("escape", escape.getOrElse("\""))
-      .option("timestampFormat", timestampFormat.getOrElse("yyyy-MM-dd HH:mm:ss"))
+      .option(
+        "timestampFormat",
+        timestampFormat.getOrElse("yyyy-MM-dd HH:mm:ss")
+      )
       .option("dateFormat", dateFormat.getOrElse("yyyy-MM-dd"))
       .option("columnNameOfCorruptRecord", "_corrupt_record")
       .option("mode", "PERMISSIVE")
@@ -82,7 +118,11 @@ object CSVTool extends LazyLogging {
 
   }
 
-  def getCsvHeaders(spark: SparkSession, path: String, delimiter: Option[String]): Array[String] = {
+  def getCsvHeaders(
+      spark: SparkSession,
+      path: String,
+      delimiter: Option[String]
+  ): Array[String] = {
     val data = spark.sparkContext.textFile(path)
     val header = data.first()
     val headers = header.split(delimiter.getOrElse(","))
@@ -100,50 +140,69 @@ object CSVTool extends LazyLogging {
   }
 
   /**
-   * Exports local files
-   *
-   * @param df a dataframe with
-   * @param fileColumn shall be a string
-   * @param contentColumn shall be a string
-   * @param folder
-   */
-  def writeDfToLocalFiles(df: DataFrame, fileColumn: String, contentColumn: String, folder: String) = {
+    * Exports local files
+    *
+    * @param df a dataframe with
+    * @param fileColumn shall be a string
+    * @param contentColumn shall be a string
+    * @param folder
+    */
+  def writeDfToLocalFiles(
+      df: DataFrame,
+      fileColumn: String,
+      contentColumn: String,
+      folder: String
+  ) = {
 
     // validate folder exists
     if (!Files.exists(Paths.get(folder)))
       throw new InvalidPathException(folder, folder)
 
     import df.sparkSession.implicits._
     val ds = df
-      .select(col(fileColumn).cast(StringType) as "file", col(contentColumn).cast(StringType) as "content")
+      .select(
+        col(fileColumn).cast(StringType) as "file",
+        col(contentColumn).cast(StringType) as "content"
+      )
       .as[exportDf]
 
-    ds.collect().foreach(
-      p => {
+    ds.collect()
+      .foreach(p => {
         val fileName = folder + "/" + p.file + ".txt"
         val writerAnn = new java.io.PrintWriter(fileName, "UTF-8")
         if (p.content != null)
          writerAnn.write(p.content)
        writerAnn.close()
-      }
-    )
+      })
     logger.info(s"Exported ${ds.count} files")
 
  }
 
-  def writeCsvLocal(df: DataFrame, tempPath: String, localPath: String, options: Map[String, String] = Map(), format: String = "csv") = {
+  def writeCsvLocal(
+      df: DataFrame,
+      tempPath: String,
+      localPath: String,
+      options: Map[String, String] = Map(),
+      format: String = "csv"
+  ) = {
     val hdfs = FileSystem.get(new Configuration())
     val hdfsPath = new Path(tempPath)
     val targetFile = new File(localPath)
-    val fileWDot = new File(targetFile.getPath.substring(0, targetFile.getPath.length - targetFile.getName.length) + "." + targetFile.getName)
+    val fileWDot = new File(
+      targetFile.getPath.substring(
+        0,
+        targetFile.getPath.length - targetFile.getName.length
+      ) + "." + targetFile.getName
+    )
     logger.warn(s"writing to temp file ${fileWDot.getAbsolutePath}")
     val mime = format match {
-      case "csv" => ".csv"
+      case "csv"  => ".csv"
       case "text" => ".txt"
-      case _ => throw new Exception("only text and csv")
+      case _      => throw new Exception("only text and csv")
    }
    try {
-      df.write.mode(SaveMode.Overwrite)
+      df.write
+        .mode(SaveMode.Overwrite)
        .format(format)
        .options(options)
        .save(tempPath)
@@ -165,12 +224,13 @@ object CSVTool extends LazyLogging {
         outStream.close()
      }
    }
-    }
-    finally {
+    } finally {
      hdfs.delete(hdfsPath, true)
      logger.warn(s"deleting hdfs path $hdfsPath")
      fileWDot.renameTo(targetFile)
-      logger.warn(s"renaming ${fileWDot.getAbsolutePath} to ${targetFile.getAbsolutePath}")
+      logger.warn(
+        s"renaming ${fileWDot.getAbsolutePath} to ${targetFile.getAbsolutePath}"
+      )
    }
  }
 
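The commit above is formatting only, so CSVTool keeps the same entry points; as the context lines show, apply still validates mandatory columns with DFTool.existColumns and back-fills optional ones with DFTool.addMissingColumns. For orientation, here is a minimal usage sketch of the reformatted apply signature; the object name, file path, schema, and option values are illustrative, not taken from the repository:

import io.frama.parisni.spark.csv.CSVTool
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

object CSVToolUsageSketch {
  def main(args: Array[String]): Unit = {
    // Local session for the sketch; in the project the session is supplied by the caller.
    val spark = SparkSession.builder().master("local[*]").getOrCreate()

    // Expected columns of the CSV file (names and types are illustrative).
    val schema = StructType(
      StructField("id", IntegerType)
        :: StructField("name", StringType)
        :: Nil
    )

    // Positional spark/path/schema followed by named options, matching the
    // multi-line parameter list shown in the diff above.
    val df = CSVTool(
      spark,
      "data/input.csv", // illustrative path
      schema,
      delimiter = Some(";"),
      multiline = Some(true)
    )
    df.show()
    spark.stop()
  }
}
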
spark-csv/src/test/scala/io/frama/parisni/spark/csv/TestCSVTool.scala

Lines changed: 21 additions & 12 deletions
@@ -14,13 +14,13 @@ class TestCSVTool extends QueryTest with SparkSessionTestWrapper {
     val schema = StructType(
       StructField("c1", IntegerType)
         :: StructField("c2", IntegerType)
-        :: StructField("c3", IntegerType, nullable=false, m)
-        :: Nil)
+        :: StructField("c3", IntegerType, nullable = false, m)
+        :: Nil
+    )
 
     val inputDF = CSVTool(spark, "test1.csv", schema)
 
-    val resultDF = spark.sql(
-      """
+    val resultDF = spark.sql("""
         select cast(1 as int) as c1, cast(null as int) as c2, cast(123 as int) as c3
         union all
         select cast(null as int) as c1, cast(1 as int) as c2, cast(123 as int) as c3
@@ -36,13 +36,13 @@ class TestCSVTool extends QueryTest with SparkSessionTestWrapper {
     val schema = StructType(
       StructField("c1", IntegerType)
         :: StructField("c2", IntegerType)
-        :: StructField("c3", DateType, nullable=false, m)
-        :: Nil)
+        :: StructField("c3", DateType, nullable = false, m)
+        :: Nil
+    )
 
     val inputDF = CSVTool(spark, "test1.csv", schema)
 
-    val resultDF = spark.sql(
-      """
+    val resultDF = spark.sql("""
         select cast(1 as int) as c1, cast(null as int) as c2, cast('1515-01-01' as date) as c3
         union all
         select cast(null as int) as c1, cast(1 as int) as c2, cast('1515-01-01' as date) as c3
@@ -64,8 +64,11 @@ class TestCSVTool extends QueryTest with SparkSessionTestWrapper {
     val schema = StructType(
       StructField("c1", StringType)
         :: StructField("c2", StringType)
-        :: Nil)
-    val res = CSVTool.getStringStructFromArray(CSVTool.getCsvHeaders(spark, "test1.csv", Some(",")))
+        :: Nil
+    )
+    val res = CSVTool.getStringStructFromArray(
+      CSVTool.getCsvHeaders(spark, "test1.csv", Some(","))
+    )
     assert(schema.prettyJson == res.prettyJson)
   }
 
@@ -78,9 +81,15 @@ class TestCSVTool extends QueryTest with SparkSessionTestWrapper {
 
   test("test write file to local") {
     import spark.implicits._
-    val df = ((1, "boby") :: (2, "jim") :: Nil).toDF("file", "content").repartition(2)
+    val df =
+      ((1, "boby") :: (2, "jim") :: Nil).toDF("file", "content").repartition(2)
     val path = Files.createTempDirectory("result")
-    CSVTool.writeDfToLocalFiles(df, "file", "content", path.toAbsolutePath.toString)
+    CSVTool.writeDfToLocalFiles(
+      df,
+      "file",
+      "content",
+      path.toAbsolutePath.toString
+    )
     assert(new File(path.toString).list().length === 2)
   }
 
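The diff in both files is whitespace and layout only, so behavior is unchanged. For reference, one common way to run scalafmt over a Scala project is the sbt-scalafmt plugin; the snippet below is a sketch under the assumption that the project builds with sbt, and the plugin version is illustrative rather than read from the repository:

// project/plugins.sbt
// Registers the scalafmt plugin for the whole build (version is illustrative).
addSbtPlugin("org.scalameta" % "sbt-scalafmt" % "2.4.6")

With the plugin on the build, running sbt scalafmtAll formats main and test sources against the .scalafmt.conf at the repository root (assuming one is present), which produces the kind of layout-only diff shown above.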