Skip to content

Detect delimiter in CSV rendering #7869

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
Aug 15, 2019
Previous commit
Next commit
fix failing testcase & use ints where possible
  • Loading branch information
noerw committed Aug 15, 2019
commit 7596c4ba843fb6ae6fe9d7fa89b3b5f934160c08
28 changes: 13 additions & 15 deletions modules/markup/csv/csv.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,10 @@ import (
"encoding/csv"
"html"
"io"
"math"
"strings"

"code.gitea.io/gitea/modules/markup"
"code.gitea.io/gitea/modules/util"
)

func init() {
Expand Down Expand Up @@ -60,18 +60,18 @@ func (p Parser) Render(rawBytes []byte, urlPrefix string, metas map[string]strin
return tmpBlock.Bytes()
}

// bestDelimiter scores the input CSV data against delimiters, and returns the best match.
// Reads at most 10k bytes & 10 lines.
func (p Parser) bestDelimiter(data []byte) rune {
// Scores the input data against delimiters, and returns the best matching.
// Reads at most 10k bytes & 10 lines.
maxLines := 10
maxBytes := int(math.Min(float64(len(data)), 1e4))
maxBytes := util.Min(len(data), 1e4)
text := string(data[:maxBytes])
lines := strings.SplitN(text, "\n", maxLines+1)[:maxLines]
lines := strings.SplitN(text, "\n", maxLines+1)
lines = lines[:util.Min(maxLines, len(lines))]

delimiters := []rune{',', ';', '\t', '|'}
bestDelim := delimiters[0]
bestScore := 0.0

for _, delim := range delimiters {
score := p.scoreDelimiter(lines, delim)
if score > bestScore {
Expand All @@ -83,28 +83,26 @@ func (p Parser) bestDelimiter(data []byte) rune {
return bestDelim
}

// scoreDelimiter uses a count & regularity metric to evaluate a delimiter against lines of CSV
func (Parser) scoreDelimiter(lines []string, delim rune) (score float64) {
// Scores a delimiter against input csv data with a count and regularity metric.

countTotal := 0.0
countLineMax := 0.0
linesNotEqual := 0.0
countTotal := 0
countLineMax := 0
linesNotEqual := 0

for _, line := range lines {
if len(line) == 0 {
continue
}

countLine := float64(strings.Count(line, string(delim)))
countLine := strings.Count(line, string(delim))
countTotal += countLine

if countLine != countLineMax {
if countLineMax != 0 {
linesNotEqual++
}
countLineMax = math.Max(countLine, countLineMax)
countLineMax = util.Max(countLine, countLineMax)
}
}

return countTotal * (1 - linesNotEqual/float64(len(lines)))
return float64(countTotal) * (1 - float64(linesNotEqual)/float64(len(lines)))
}
2 changes: 1 addition & 1 deletion modules/markup/csv/csv_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ func TestRenderCSV(t *testing.T) {
"1;2": "<table class=\"table\"><tr><td>1</td><td>2</td><tr></table>",
"1\t2": "<table class=\"table\"><tr><td>1</td><td>2</td><tr></table>",
"1|2": "<table class=\"table\"><tr><td>1</td><td>2</td><tr></table>",
"1,2,3;4,5,6;7,8,9\na;b;c": "<table class=\"table\"><tr><td>1,2,3</td><td>4,5,6</td><td>7,8,9</td><tr><tr><td>a</td><td>b</td<td>c</td><tr></table>",
"1,2,3;4,5,6;7,8,9\na;b;c": "<table class=\"table\"><tr><td>1,2,3</td><td>4,5,6</td><td>7,8,9</td><tr><tr><td>a</td><td>b</td><td>c</td><tr></table>",
"<br/>": "<table class=\"table\"><tr><td>&lt;br/&gt;</td><tr></table>",
}

Expand Down