diff mbox series

[4/6] tests/acceptance: Introduce tesseract_ocr() helper

Message ID 20201021105035.2477784-5-f4bug@amsat.org
State Accepted
Commit ca8224492854a2930d0cadc76e715bf59582bf66
Headers show
Series tests/acceptance: Test the Fuloong 2E machine | expand

Commit Message

Philippe Mathieu-Daudé Oct. 21, 2020, 10:50 a.m. UTC
We are going to reuse the tesseract OCR code.
Create a new tesseract_ocr() helper and use it.

Signed-off-by: Philippe Mathieu-Daudé <f4bug@amsat.org>
---
 tests/acceptance/machine_m68k_nextcube.py | 21 +++++----------------
 tests/acceptance/tesseract_utils.py       | 18 ++++++++++++++++++
 2 files changed, 23 insertions(+), 16 deletions(-)

Comments

Thomas Huth Oct. 24, 2020, 6:35 a.m. UTC | #1
On 21/10/2020 12.50, Philippe Mathieu-Daudé wrote:
> We are going to reuse the tesseract OCR code.

> Create a new tesseract_ocr() helper and use it.

> 

> Signed-off-by: Philippe Mathieu-Daudé <f4bug@amsat.org>

> ---

>  tests/acceptance/machine_m68k_nextcube.py | 21 +++++----------------

>  tests/acceptance/tesseract_utils.py       | 18 ++++++++++++++++++

>  2 files changed, 23 insertions(+), 16 deletions(-)

> 

> diff --git a/tests/acceptance/machine_m68k_nextcube.py b/tests/acceptance/machine_m68k_nextcube.py

> index 3c7400c43e4..09e2745cc52 100644

> --- a/tests/acceptance/machine_m68k_nextcube.py

> +++ b/tests/acceptance/machine_m68k_nextcube.py

> @@ -7,13 +7,11 @@

>  

>  import os

>  import time

> -import logging

>  

>  from avocado_qemu import Test

>  from avocado import skipUnless

> -from avocado.utils import process

>  

> -from tesseract_utils import tesseract_available

> +from tesseract_utils import tesseract_available, tesseract_ocr

>  

>  PIL_AVAILABLE = True

>  try:

> @@ -61,12 +59,8 @@ def test_bootrom_framebuffer_size(self):

>      def test_bootrom_framebuffer_ocr_with_tesseract_v3(self):

>          screenshot_path = os.path.join(self.workdir, "dump.ppm")

>          self.check_bootrom_framebuffer(screenshot_path)

> -

> -        console_logger = logging.getLogger('console')

> -        text = process.run("tesseract %s stdout" % screenshot_path).stdout_text

> -        for line in text.split('\n'):

> -            if len(line):

> -                console_logger.debug(line)

> +        lines = tesseract_ocr(screenshot_path, tesseract_version=3)

> +        text = '\n'.join(lines)

>          self.assertIn('Backplane', text)

>          self.assertIn('Ethernet address', text)

>  

> @@ -77,13 +71,8 @@ def test_bootrom_framebuffer_ocr_with_tesseract_v3(self):

>      def test_bootrom_framebuffer_ocr_with_tesseract_v4(self):

>          screenshot_path = os.path.join(self.workdir, "dump.ppm")

>          self.check_bootrom_framebuffer(screenshot_path)

> -

> -        console_logger = logging.getLogger('console')

> -        proc = process.run("tesseract --oem 1 %s stdout" % screenshot_path)

> -        text = proc.stdout_text

> -        for line in text.split('\n'):

> -            if len(line):

> -                console_logger.debug(line)

> +        lines = tesseract_ocr(screenshot_path, tesseract_version=4)

> +        text = '\n'.join(lines)

>          self.assertIn('Testing the FPU, SCC', text)

>          self.assertIn('System test failed. Error code', text)

>          self.assertIn('Boot command', text)

> diff --git a/tests/acceptance/tesseract_utils.py b/tests/acceptance/tesseract_utils.py

> index acd6e8c2faa..72cd9ab7989 100644

> --- a/tests/acceptance/tesseract_utils.py

> +++ b/tests/acceptance/tesseract_utils.py

> @@ -6,7 +6,9 @@

>  # later. See the COPYING file in the top-level directory.

>  

>  import re

> +import logging

>  

> +from avocado.utils import process

>  from avocado.utils.path import find_command, CmdNotFoundError

>  

>  def tesseract_available(expected_version):

> @@ -26,3 +28,19 @@ def tesseract_available(expected_version):

>          return False

>      # now this is guaranteed to be a digit

>      return int(match.groups()[0]) == expected_version

> +

> +

> +def tesseract_ocr(image_path, tesseract_args='', tesseract_version=3):

> +    console_logger = logging.getLogger('tesseract')

> +    console_logger.debug(image_path)

> +    if tesseract_version == 4:

> +        tesseract_args += ' --oem 1'

> +    proc = process.run("tesseract {} {} stdout".format(tesseract_args,

> +                                                       image_path))

> +    lines = []

> +    for line in proc.stdout_text.split('\n'):

> +        sline = line.strip()

> +        if len(sline):

> +            console_logger.debug(sline)

> +            lines += [sline]

> +    return lines


Would it make sense to completely hide the tesseract version handling in
this new tesseract_utils.py file now, so that the tests themselves do not
have to worry about this anymore (i.e. would it be possible to merge
test_bootrom_framebuffer_ocr_with_tesseract_v3 and
test_bootrom_framebuffer_ocr_with_tesseract_v4 into one single test that way?)

 Thomas
Thomas Huth Oct. 24, 2020, 6:40 a.m. UTC | #2
On 24/10/2020 08.35, Thomas Huth wrote:
> On 21/10/2020 12.50, Philippe Mathieu-Daudé wrote:

>> We are going to reuse the tesseract OCR code.

>> Create a new tesseract_ocr() helper and use it.

>>

>> Signed-off-by: Philippe Mathieu-Daudé <f4bug@amsat.org>

>> ---

>>  tests/acceptance/machine_m68k_nextcube.py | 21 +++++----------------

>>  tests/acceptance/tesseract_utils.py       | 18 ++++++++++++++++++

>>  2 files changed, 23 insertions(+), 16 deletions(-)

>>

>> diff --git a/tests/acceptance/machine_m68k_nextcube.py b/tests/acceptance/machine_m68k_nextcube.py

>> index 3c7400c43e4..09e2745cc52 100644

>> --- a/tests/acceptance/machine_m68k_nextcube.py

>> +++ b/tests/acceptance/machine_m68k_nextcube.py

>> @@ -7,13 +7,11 @@

>>  

>>  import os

>>  import time

>> -import logging

>>  

>>  from avocado_qemu import Test

>>  from avocado import skipUnless

>> -from avocado.utils import process

>>  

>> -from tesseract_utils import tesseract_available

>> +from tesseract_utils import tesseract_available, tesseract_ocr

>>  

>>  PIL_AVAILABLE = True

>>  try:

>> @@ -61,12 +59,8 @@ def test_bootrom_framebuffer_size(self):

>>      def test_bootrom_framebuffer_ocr_with_tesseract_v3(self):

>>          screenshot_path = os.path.join(self.workdir, "dump.ppm")

>>          self.check_bootrom_framebuffer(screenshot_path)

>> -

>> -        console_logger = logging.getLogger('console')

>> -        text = process.run("tesseract %s stdout" % screenshot_path).stdout_text

>> -        for line in text.split('\n'):

>> -            if len(line):

>> -                console_logger.debug(line)

>> +        lines = tesseract_ocr(screenshot_path, tesseract_version=3)

>> +        text = '\n'.join(lines)

>>          self.assertIn('Backplane', text)

>>          self.assertIn('Ethernet address', text)

>>  

>> @@ -77,13 +71,8 @@ def test_bootrom_framebuffer_ocr_with_tesseract_v3(self):

>>      def test_bootrom_framebuffer_ocr_with_tesseract_v4(self):

>>          screenshot_path = os.path.join(self.workdir, "dump.ppm")

>>          self.check_bootrom_framebuffer(screenshot_path)

>> -

>> -        console_logger = logging.getLogger('console')

>> -        proc = process.run("tesseract --oem 1 %s stdout" % screenshot_path)

>> -        text = proc.stdout_text

>> -        for line in text.split('\n'):

>> -            if len(line):

>> -                console_logger.debug(line)

>> +        lines = tesseract_ocr(screenshot_path, tesseract_version=4)

>> +        text = '\n'.join(lines)

>>          self.assertIn('Testing the FPU, SCC', text)

>>          self.assertIn('System test failed. Error code', text)

>>          self.assertIn('Boot command', text)

>> diff --git a/tests/acceptance/tesseract_utils.py b/tests/acceptance/tesseract_utils.py

>> index acd6e8c2faa..72cd9ab7989 100644

>> --- a/tests/acceptance/tesseract_utils.py

>> +++ b/tests/acceptance/tesseract_utils.py

>> @@ -6,7 +6,9 @@

>>  # later. See the COPYING file in the top-level directory.

>>  

>>  import re

>> +import logging

>>  

>> +from avocado.utils import process

>>  from avocado.utils.path import find_command, CmdNotFoundError

>>  

>>  def tesseract_available(expected_version):

>> @@ -26,3 +28,19 @@ def tesseract_available(expected_version):

>>          return False

>>      # now this is guaranteed to be a digit

>>      return int(match.groups()[0]) == expected_version

>> +

>> +

>> +def tesseract_ocr(image_path, tesseract_args='', tesseract_version=3):

>> +    console_logger = logging.getLogger('tesseract')

>> +    console_logger.debug(image_path)

>> +    if tesseract_version == 4:

>> +        tesseract_args += ' --oem 1'

>> +    proc = process.run("tesseract {} {} stdout".format(tesseract_args,

>> +                                                       image_path))

>> +    lines = []

>> +    for line in proc.stdout_text.split('\n'):

>> +        sline = line.strip()

>> +        if len(sline):

>> +            console_logger.debug(sline)

>> +            lines += [sline]

>> +    return lines

> 

> Would it make sense to completely hide the tesseract version handling in

> this new tesseract_utils.py file now, so that the tests themselves do not

> have to worry about this anymore (i.e. would it be possible to merge

> test_bootrom_framebuffer_ocr_with_tesseract_v3 and

> test_bootrom_framebuffer_ocr_with_tesseract_v4 into one single test that way?)


If I've got that right, there is also now a proper release 4 of Tesseract,
so maybe we can simply scratch the testing with version 3 now?

 Thomas
Philippe Mathieu-Daudé Oct. 24, 2020, 7:37 a.m. UTC | #3
On 10/24/20 8:40 AM, Thomas Huth wrote:
> On 24/10/2020 08.35, Thomas Huth wrote:

>> On 21/10/2020 12.50, Philippe Mathieu-Daudé wrote:

>>> We are going to reuse the tesseract OCR code.

>>> Create a new tesseract_ocr() helper and use it.

>>>

>>> Signed-off-by: Philippe Mathieu-Daudé <f4bug@amsat.org>

>>> ---

>>>   tests/acceptance/machine_m68k_nextcube.py | 21 +++++----------------

>>>   tests/acceptance/tesseract_utils.py       | 18 ++++++++++++++++++

>>>   2 files changed, 23 insertions(+), 16 deletions(-)

>>>

>>> diff --git a/tests/acceptance/machine_m68k_nextcube.py b/tests/acceptance/machine_m68k_nextcube.py

>>> index 3c7400c43e4..09e2745cc52 100644

>>> --- a/tests/acceptance/machine_m68k_nextcube.py

>>> +++ b/tests/acceptance/machine_m68k_nextcube.py

>>> @@ -7,13 +7,11 @@

>>>   

>>>   import os

>>>   import time

>>> -import logging

>>>   

>>>   from avocado_qemu import Test

>>>   from avocado import skipUnless

>>> -from avocado.utils import process

>>>   

>>> -from tesseract_utils import tesseract_available

>>> +from tesseract_utils import tesseract_available, tesseract_ocr

>>>   

>>>   PIL_AVAILABLE = True

>>>   try:

>>> @@ -61,12 +59,8 @@ def test_bootrom_framebuffer_size(self):

>>>       def test_bootrom_framebuffer_ocr_with_tesseract_v3(self):

>>>           screenshot_path = os.path.join(self.workdir, "dump.ppm")

>>>           self.check_bootrom_framebuffer(screenshot_path)

>>> -

>>> -        console_logger = logging.getLogger('console')

>>> -        text = process.run("tesseract %s stdout" % screenshot_path).stdout_text

>>> -        for line in text.split('\n'):

>>> -            if len(line):

>>> -                console_logger.debug(line)

>>> +        lines = tesseract_ocr(screenshot_path, tesseract_version=3)

>>> +        text = '\n'.join(lines)

>>>           self.assertIn('Backplane', text)

>>>           self.assertIn('Ethernet address', text)

>>>   

>>> @@ -77,13 +71,8 @@ def test_bootrom_framebuffer_ocr_with_tesseract_v3(self):

>>>       def test_bootrom_framebuffer_ocr_with_tesseract_v4(self):

>>>           screenshot_path = os.path.join(self.workdir, "dump.ppm")

>>>           self.check_bootrom_framebuffer(screenshot_path)

>>> -

>>> -        console_logger = logging.getLogger('console')

>>> -        proc = process.run("tesseract --oem 1 %s stdout" % screenshot_path)

>>> -        text = proc.stdout_text

>>> -        for line in text.split('\n'):

>>> -            if len(line):

>>> -                console_logger.debug(line)

>>> +        lines = tesseract_ocr(screenshot_path, tesseract_version=4)

>>> +        text = '\n'.join(lines)

>>>           self.assertIn('Testing the FPU, SCC', text)

>>>           self.assertIn('System test failed. Error code', text)

>>>           self.assertIn('Boot command', text)

>>> diff --git a/tests/acceptance/tesseract_utils.py b/tests/acceptance/tesseract_utils.py

>>> index acd6e8c2faa..72cd9ab7989 100644

>>> --- a/tests/acceptance/tesseract_utils.py

>>> +++ b/tests/acceptance/tesseract_utils.py

>>> @@ -6,7 +6,9 @@

>>>   # later. See the COPYING file in the top-level directory.

>>>   

>>>   import re

>>> +import logging

>>>   

>>> +from avocado.utils import process

>>>   from avocado.utils.path import find_command, CmdNotFoundError

>>>   

>>>   def tesseract_available(expected_version):

>>> @@ -26,3 +28,19 @@ def tesseract_available(expected_version):

>>>           return False

>>>       # now this is guaranteed to be a digit

>>>       return int(match.groups()[0]) == expected_version

>>> +

>>> +

>>> +def tesseract_ocr(image_path, tesseract_args='', tesseract_version=3):

>>> +    console_logger = logging.getLogger('tesseract')

>>> +    console_logger.debug(image_path)

>>> +    if tesseract_version == 4:

>>> +        tesseract_args += ' --oem 1'

>>> +    proc = process.run("tesseract {} {} stdout".format(tesseract_args,

>>> +                                                       image_path))

>>> +    lines = []

>>> +    for line in proc.stdout_text.split('\n'):

>>> +        sline = line.strip()

>>> +        if len(sline):

>>> +            console_logger.debug(sline)

>>> +            lines += [sline]

>>> +    return lines

>>

>> Would it make sense to completely hide the tesseract version handling in

>> this new tesseract_utils.py file now, so that the tests themselves do not

>> have to worry about this anymore


Yes, good idea.

> (i.e. would it be possible to merge

>> test_bootrom_framebuffer_ocr_with_tesseract_v3 and

>> test_bootrom_framebuffer_ocr_with_tesseract_v4 into one single test that way?)

> 

> If I've got that right, there is also now a proper release 4 of Tesseract,

> so maybe we can simply scratch the testing with version 3 now?


Good to know, I'll have a look. Thanks!

> 

>   Thomas

>
Philippe Mathieu-Daudé Oct. 24, 2020, 5:40 p.m. UTC | #4
On 10/24/20 9:37 AM, Philippe Mathieu-Daudé wrote:
> On 10/24/20 8:40 AM, Thomas Huth wrote:

>> On 24/10/2020 08.35, Thomas Huth wrote:

>>> On 21/10/2020 12.50, Philippe Mathieu-Daudé wrote:

>>>> We are going to reuse the tesseract OCR code.

>>>> Create a new tesseract_ocr() helper and use it.

>>>>

>>>> Signed-off-by: Philippe Mathieu-Daudé <f4bug@amsat.org>

>>>> ---

>>>>   tests/acceptance/machine_m68k_nextcube.py | 21 +++++----------------

>>>>   tests/acceptance/tesseract_utils.py       | 18 ++++++++++++++++++

>>>>   2 files changed, 23 insertions(+), 16 deletions(-)

...

>>>

>>> Would it make sense to completely hide the tesseract version handling in

>>> this new tesseract_utils.py file now, so that the tests themselves do 

>>> not

>>> have to worry about this anymore


The problem is that the recognized strings differ between versions;
see tests/acceptance/machine_m68k_nextcube.py:

         lines = tesseract_ocr(screenshot_path, tesseract_version=3)
         text = '\n'.join(lines)
         self.assertIn('Backplane', text)
         self.assertIn('Ethernet address', text)

and:

         lines = tesseract_ocr(screenshot_path, tesseract_version=4)
         text = '\n'.join(lines)
         self.assertIn('Testing the FPU, SCC', text)
         self.assertIn('System test failed. Error code', text)
         self.assertIn('Boot command', text)
         self.assertIn('Next>', text)

> 

> Yes, good idea.

> 

>> (i.e. would it be possible to merge

>>> test_bootrom_framebuffer_ocr_with_tesseract_v3 and

>>> test_bootrom_framebuffer_ocr_with_tesseract_v4 into one single test 

>>> that way?)

>>

>> If I've got that right, there is also now a proper release 4 of 

>> Tesseract,

>> so maybe we can simply scratch the testing with version 3 now?

> 

> Good to know, I'll have a look. Thanks!

> 

>>

>>   Thomas

>>

>
Thomas Huth Oct. 26, 2020, 6:09 a.m. UTC | #5
On 24/10/2020 19.40, Philippe Mathieu-Daudé wrote:
> On 10/24/20 9:37 AM, Philippe Mathieu-Daudé wrote:

>> On 10/24/20 8:40 AM, Thomas Huth wrote:

>>> On 24/10/2020 08.35, Thomas Huth wrote:

>>>> On 21/10/2020 12.50, Philippe Mathieu-Daudé wrote:

>>>>> We are going to reuse the tesseract OCR code.

>>>>> Create a new tesseract_ocr() helper and use it.

>>>>>

>>>>> Signed-off-by: Philippe Mathieu-Daudé <f4bug@amsat.org>

>>>>> ---

>>>>>   tests/acceptance/machine_m68k_nextcube.py | 21 +++++----------------

>>>>>   tests/acceptance/tesseract_utils.py       | 18 ++++++++++++++++++

>>>>>   2 files changed, 23 insertions(+), 16 deletions(-)

> ...

> 

>>>>

>>>> Would it make sense to completely hide the tesseract version handling in

>>>> this new tesseract_utils.py file now, so that the tests themselves do not

>>>> have to worry about this anymore

> 

> The problem is the recognized strings differ between versions,

> see in tests/acceptance/machine_m68k_nextcube.py:

> 

>         lines = tesseract_ocr(screenshot_path, tesseract_version=3)

>         text = '\n'.join(lines)

>         self.assertIn('Backplane', text)

>         self.assertIn('Ethernet address', text)

> 

> and:

> 

>         lines = tesseract_ocr(screenshot_path, tesseract_version=4)

>         text = '\n'.join(lines)

>         self.assertIn('Testing the FPU, SCC', text)

>         self.assertIn('System test failed. Error code', text)

>         self.assertIn('Boot command', text)

>         self.assertIn('Next>', text)


Ah, right, I forgot about that ... well, one more reason to completely
switch to tesseract v4 now ;-)

 Thomas
diff mbox series

Patch

diff --git a/tests/acceptance/machine_m68k_nextcube.py b/tests/acceptance/machine_m68k_nextcube.py
index 3c7400c43e4..09e2745cc52 100644
--- a/tests/acceptance/machine_m68k_nextcube.py
+++ b/tests/acceptance/machine_m68k_nextcube.py
@@ -7,13 +7,11 @@ 
 
 import os
 import time
-import logging
 
 from avocado_qemu import Test
 from avocado import skipUnless
-from avocado.utils import process
 
-from tesseract_utils import tesseract_available
+from tesseract_utils import tesseract_available, tesseract_ocr
 
 PIL_AVAILABLE = True
 try:
@@ -61,12 +59,8 @@  def test_bootrom_framebuffer_size(self):
     def test_bootrom_framebuffer_ocr_with_tesseract_v3(self):
         screenshot_path = os.path.join(self.workdir, "dump.ppm")
         self.check_bootrom_framebuffer(screenshot_path)
-
-        console_logger = logging.getLogger('console')
-        text = process.run("tesseract %s stdout" % screenshot_path).stdout_text
-        for line in text.split('\n'):
-            if len(line):
-                console_logger.debug(line)
+        lines = tesseract_ocr(screenshot_path, tesseract_version=3)
+        text = '\n'.join(lines)
         self.assertIn('Backplane', text)
         self.assertIn('Ethernet address', text)
 
@@ -77,13 +71,8 @@  def test_bootrom_framebuffer_ocr_with_tesseract_v3(self):
     def test_bootrom_framebuffer_ocr_with_tesseract_v4(self):
         screenshot_path = os.path.join(self.workdir, "dump.ppm")
         self.check_bootrom_framebuffer(screenshot_path)
-
-        console_logger = logging.getLogger('console')
-        proc = process.run("tesseract --oem 1 %s stdout" % screenshot_path)
-        text = proc.stdout_text
-        for line in text.split('\n'):
-            if len(line):
-                console_logger.debug(line)
+        lines = tesseract_ocr(screenshot_path, tesseract_version=4)
+        text = '\n'.join(lines)
         self.assertIn('Testing the FPU, SCC', text)
         self.assertIn('System test failed. Error code', text)
         self.assertIn('Boot command', text)
diff --git a/tests/acceptance/tesseract_utils.py b/tests/acceptance/tesseract_utils.py
index acd6e8c2faa..72cd9ab7989 100644
--- a/tests/acceptance/tesseract_utils.py
+++ b/tests/acceptance/tesseract_utils.py
@@ -6,7 +6,9 @@ 
 # later. See the COPYING file in the top-level directory.
 
 import re
+import logging
 
+from avocado.utils import process
 from avocado.utils.path import find_command, CmdNotFoundError
 
 def tesseract_available(expected_version):
@@ -26,3 +28,19 @@  def tesseract_available(expected_version):
         return False
     # now this is guaranteed to be a digit
     return int(match.groups()[0]) == expected_version
+
+
+def tesseract_ocr(image_path, tesseract_args='', tesseract_version=3):
+    console_logger = logging.getLogger('tesseract')
+    console_logger.debug(image_path)
+    if tesseract_version == 4:
+        tesseract_args += ' --oem 1'
+    proc = process.run("tesseract {} {} stdout".format(tesseract_args,
+                                                       image_path))
+    lines = []
+    for line in proc.stdout_text.split('\n'):
+        sline = line.strip()
+        if len(sline):
+            console_logger.debug(sline)
+            lines += [sline]
+    return lines