Skip to content

Commit f50e774

Browse files
authored
Remove nokogiri (#72)
* Remove nokogiri dependency
* Update Gemfile and bump version
* Update methods
* Update rubocop version on hound and a little refactoring.
1 parent c68edcd commit f50e774

23 files changed

+111
-57
lines changed

.hound.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
rubocop:
2+
config_file: .rubocop.yml
3+
version: 0.80.0

.rubocop.yml

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,20 @@
1-
Documentation:
2-
Enabled: false
31

4-
Metrics/LineLength:
2+
Layout/LineLength:
53
Max: 150
64

75
Metrics/BlockLength:
86
Max: 50
7+
8+
Metrics/AbcSize:
9+
Max: 30
10+
11+
Style/Documentation:
12+
Enabled: false
13+
Style/HashEachMethods:
14+
Enabled: true
15+
16+
Style/HashTransformKeys:
17+
Enabled: true
18+
19+
Style/HashTransformValues:
20+
Enabled: true

Gemfile

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
# frozen_string_literal: true
2+
13
source 'https://rubygems.org'
24

35
git_source(:github) { |repo_name| "https://github.com/#{repo_name}" }

Gemfile.lock

Lines changed: 19 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,51 +1,45 @@
11
PATH
22
remote: .
33
specs:
4-
rtesseract (3.0.5)
5-
nokogiri
4+
rtesseract (3.1.0)
65

76
GEM
87
remote: https://rubygems.org/
98
specs:
10-
coveralls (0.7.2)
11-
multi_json (~> 1.3)
12-
rest-client (= 1.6.7)
13-
simplecov (>= 0.7)
14-
term-ansicolor (= 1.2.2)
15-
thor (= 0.18.1)
9+
coveralls (0.8.23)
10+
json (>= 1.8, < 3)
11+
simplecov (~> 0.16.1)
12+
term-ansicolor (~> 1.3)
13+
thor (>= 0.19.4, < 2.0)
14+
tins (~> 1.6)
1615
diff-lcs (1.3)
1716
docile (1.3.2)
18-
mime-types (3.3.1)
19-
mime-types-data (~> 3.2015)
20-
mime-types-data (3.2019.1009)
21-
mini_portile2 (2.4.0)
22-
multi_json (1.14.1)
23-
nokogiri (1.10.9)
24-
mini_portile2 (~> 2.4.0)
17+
json (2.3.0)
2518
rake (13.0.1)
26-
rest-client (1.6.7)
27-
mime-types (>= 1.16)
2819
rspec (3.9.0)
2920
rspec-core (~> 3.9.0)
3021
rspec-expectations (~> 3.9.0)
3122
rspec-mocks (~> 3.9.0)
3223
rspec-core (3.9.1)
3324
rspec-support (~> 3.9.1)
34-
rspec-expectations (3.9.0)
25+
rspec-expectations (3.9.1)
3526
diff-lcs (>= 1.2.0, < 2.0)
3627
rspec-support (~> 3.9.0)
3728
rspec-mocks (3.9.1)
3829
diff-lcs (>= 1.2.0, < 2.0)
3930
rspec-support (~> 3.9.0)
4031
rspec-support (3.9.2)
41-
simplecov (0.18.5)
32+
simplecov (0.16.1)
4233
docile (~> 1.1)
43-
simplecov-html (~> 0.11)
44-
simplecov-html (0.12.2)
45-
term-ansicolor (1.2.2)
46-
tins (~> 0.8)
47-
thor (0.18.1)
48-
tins (0.13.2)
34+
json (>= 1.8, < 3)
35+
simplecov-html (~> 0.10.0)
36+
simplecov-html (0.10.2)
37+
sync (0.5.0)
38+
term-ansicolor (1.7.1)
39+
tins (~> 1.0)
40+
thor (1.0.1)
41+
tins (1.24.1)
42+
sync
4943

5044
PLATFORMS
5145
ruby
@@ -56,7 +50,6 @@ DEPENDENCIES
5650
rake
5751
rspec
5852
rtesseract!
59-
simplecov
6053

6154
BUNDLED WITH
6255
2.1.4

Rakefile

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
# frozen_string_literal: true
2+
13
require 'bundler/gem_tasks'
24
require 'rspec/core/rake_task'
35

bin/console

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
#!/usr/bin/env ruby
2+
# frozen_string_literal: true
23

34
require 'bundler/setup'
45
require 'rtesseract'

lib/rtesseract.rb

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
# frozen_string_literal: true
2+
13
require 'rtesseract/check'
24
require 'rtesseract/configuration'
35
require 'rtesseract/command'

lib/rtesseract/base.rb

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
# frozen_string_literal: true
2+
13
require 'tmpdir'
24
require 'securerandom'
35
require 'pathname'

lib/rtesseract/box.rb

Lines changed: 32 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,33 +1,45 @@
1-
require 'nokogiri'
1+
# frozen_string_literal: true
22

33
class RTesseract
44
module Box
55
extend RTesseract::Base
66

7-
def self.run(source, errors, options)
8-
options.tessedit_create_hocr = 1
7+
class << self
8+
def run(source, errors, options)
9+
options.tessedit_create_hocr = 1
910

10-
RTesseract::Command.new(source, temp_file, errors, options).run
11+
RTesseract::Command.new(source, temp_file, errors, options).run
1112

12-
parse(File.read(temp_file('.hocr')))
13-
end
13+
parse(File.read(temp_file('.hocr')))
14+
end
1415

15-
def self.parse(content)
16-
html = Nokogiri::HTML(content)
17-
html.css('span.ocrx_word, span.ocr_word').map do |word|
18-
attributes = word.attributes['title'].value.to_s.delete(';').split(' ')
19-
word_info(word, attributes)
16+
def parse(content)
17+
content.lines.map { |line| parse_line(line) }.compact
2018
end
21-
end
2219

23-
def self.word_info(word, data)
24-
{
25-
word: word.text,
26-
x_start: data[1].to_i,
27-
y_start: data[2].to_i,
28-
x_end: data[3].to_i,
29-
y_end: data[4].to_i
30-
}
20+
def parse_line(line)
21+
return unless line.match?(/oc(rx|r)_word/)
22+
23+
word = line.match(/(?<=>)(.*?)(?=<)/).to_s
24+
25+
return if word.strip == ''
26+
27+
word_info(word, parse_position(line))
28+
end
29+
30+
def word_info(word, positions)
31+
{
32+
word: word,
33+
x_start: positions[1].to_i,
34+
y_start: positions[2].to_i,
35+
x_end: positions[3].to_i,
36+
y_end: positions[4].to_i
37+
}
38+
end
39+
40+
def parse_position(line)
41+
line.match(/(?<=title)(.*?)(?=;)/).to_s.split(' ')
42+
end
3143
end
3244
end
3345
end

lib/rtesseract/check.rb

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
# frozen_string_literal: true
2+
13
class RTesseract
24
class << self
35
def tesseract_version

lib/rtesseract/command.rb

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
# frozen_string_literal: true
2+
13
class RTesseract
24
class Command
35
FIXED = %i[command psm oem lang tessdata_dir user_words user_patterns config_file].freeze

lib/rtesseract/configuration.rb

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
# frozen_string_literal: true
2+
13
require 'ostruct'
24

35
class RTesseract

lib/rtesseract/pdf.rb

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
# frozen_string_literal: true
2+
13
class RTesseract
24
module Pdf
35
extend Base

lib/rtesseract/text.rb

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
# frozen_string_literal: true
2+
13
require 'open3'
24

35
class RTesseract

lib/rtesseract/tsv.rb

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
# frozen_string_literal: true
2+
13
class RTesseract
24
module Tsv
35
extend Base

lib/rtesseract/version.rb

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
# frozen_string_literal: true
2+
13
class RTesseract
2-
VERSION = '3.0.5'.freeze
4+
VERSION = '3.1.0'
35
end

rtesseract.gemspec

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
# frozen_string_literal: true
2+
13
lib = File.expand_path('lib', __dir__)
24
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
35
require 'rtesseract/version'
@@ -8,9 +10,9 @@ Gem::Specification.new do |spec|
810
spec.authors = ['Danilo Jeremias da Silva']
911
spec.email = ['[email protected]']
1012

11-
spec.summary = 'Ruby library for working with the Tesseract OCR.'.freeze
12-
spec.description = 'Ruby library for working with the Tesseract OCR.'.freeze
13-
spec.homepage = 'http://github.com/dannnylo/rtesseract'.freeze
13+
spec.summary = 'Ruby library for working with the Tesseract OCR.'
14+
spec.description = 'Ruby library for working with the Tesseract OCR.'
15+
spec.homepage = 'http://github.com/dannnylo/rtesseract'
1416
spec.license = 'MIT'
1517

1618
# Specify which files should be added to the gem when it is released.
@@ -26,7 +28,4 @@ Gem::Specification.new do |spec|
2628
spec.add_development_dependency 'coveralls'
2729
spec.add_development_dependency 'rake'
2830
spec.add_development_dependency 'rspec'
29-
spec.add_development_dependency 'simplecov'
30-
31-
spec.add_dependency 'nokogiri'
3231
end

spec/rtesseract/box_spec.rb

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
# frozen_string_literal: true
2+
13
RSpec.describe RTesseract::Box do
24
let(:path) { Pathname.new(File.dirname(__FILE__)).join('..') }
35
let(:words_image) { path.join('resources', 'test_words.png').to_s }

spec/rtesseract/configuration_spec.rb

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
# frozen_string_literal: true
2+
13
RSpec.describe RTesseract do
24
let(:path) { Pathname.new(File.dirname(__FILE__)).join('..') }
35

spec/rtesseract/pdf_spec.rb

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
# frozen_string_literal: true
2+
13
RSpec.describe RTesseract::Pdf do
24
let(:path) { Pathname.new(File.dirname(__FILE__)).join('..') }
35

spec/rtesseract/tsv_spec.rb

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
# frozen_string_literal: true
2+
13
require 'csv'
24

35
RSpec.describe RTesseract::Tsv do

spec/rtesseract_spec.rb

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
# frozen_string_literal: true
2+
13
RSpec.describe RTesseract do
24
let(:path) { Pathname.new(__dir__) }
35
let(:image_path) { path.join('resources', 'test.tif').to_s }
@@ -52,7 +54,7 @@
5254
it 'store the error on a variable to debug' do
5355
instance = RTesseract.new
5456
expect { instance.to_s }.to raise_error(RTesseract::Error)
55-
expect(instance.errors.first).to include("Error during processing")
57+
expect(instance.errors.first).to include('Error during processing')
5658

5759
error_intance = RTesseract.new(path.join('resources', 'image_with_error.png').to_s)
5860

spec/spec_helper.rb

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
# frozen_string_literal: true
2+
13
require 'bundler/setup'
24
require 'coveralls'
35
require 'simplecov'

0 commit comments

Comments
 (0)