[prev in list] [next in list] [prev in thread] [next in thread] 

List:       freedesktop-poppler
Subject:    Re: [poppler] [PATCH] TextPage::getText in rawOrder mode.
From:       Daniel Garcia Moreno <danigm () yaco ! es>
Date:       2010-05-10 8:37:57
Message-ID: 1273480677.4347.11.camel () localhost ! Stibbons
[Download RAW message or body]

> 
> I filed the bug [1], and attached a patch. I attach the patch in this
> mail too.
> 

I have made a new patch that improves the adjustment to the x and y limits
in TextPage::getText [1]. I also have a branch on GitHub [2] where I'm
pushing these changes.

I attach here the two patches that fix the bug.

[1] https://bugs.freedesktop.org/show_bug.cgi?id=27999
[2] http://github.com/danigm/poppler/tree/gettext

["0001-TextData-getText-return-text-in-rawOrder.patch" (0001-TextData-getText-return-text-in-rawOrder.patch)]

From 50f6ae59d101fb60bd61e8b8063eac10121074da Mon Sep 17 00:00:00 2001
From: danigm <dani@danigm.net>
Date: Thu, 6 May 2010 23:52:04 +0200
Subject: [PATCH 1/2] TextData::getText return text in rawOrder

---
 poppler/TextOutputDev.cc |   30 +++++++++++++++++++---
 test/CMakeLists.txt      |    7 +++-
 test/gettext-test.cc     |   63 ++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 94 insertions(+), 6 deletions(-)
 create mode 100644 test/gettext-test.cc

diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc
index ef9c486..28b864b 100644
--- a/poppler/TextOutputDev.cc
+++ b/poppler/TextOutputDev.cc
@@ -3602,10 +3602,6 @@ GooString *TextPage::getText(double xMin, double yMin,
 
   s = new GooString();
 
-  if (rawOrder) {
-    return s;
-  }
-
   // get the output encoding
   if (!(uMap = globalParams->getTextEncoding())) {
     return s;
@@ -3626,6 +3622,32 @@ GooString *TextPage::getText(double xMin, double yMin,
     break;
   }
 
+  if (rawOrder) {
+    TextWordList *wordlist;
+    wordlist = makeWordList(gFalse);
+    int word_length = wordlist->getLength ();
+    TextWord *word;
+    double xMinA, yMinA, xMaxA, yMaxA;
+
+    for (int i=0; i < word_length; i++)
+    {
+      word = wordlist->get (i);
+      word->getBBox (&xMinA, &yMinA, &xMaxA, &yMaxA);
+      if (xMinA > xMin && yMinA > yMin && xMaxA < xMax && yMaxA < yMax)
+        s->append (word->getText ());
+      else
+        continue;
+      if (word->getNext() && word->getNext()->primaryDelta (word) <= 0)
+      {
+	s->append(space, spaceLen);
+      } else {
+	s->append(eol, eolLen);
+      }
+    }
+    return s;
+  }
+
+
   //~ writing mode (horiz/vert)
 
   // collect the line fragments that are in the rectangle
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index acb867b..fadcd45 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -1,4 +1,3 @@
-
 if (ENABLE_SPLASH)
 
   if (HAVE_NANOSLEEP OR LIB_RT_HAS_NANOSLEEP)
@@ -58,4 +57,8 @@ set (pdf_fullrewrite_SRCS
 add_executable(pdf-fullrewrite ${pdf_fullrewrite_SRCS})
 target_link_libraries(pdf-fullrewrite poppler)
 
-
+set (gettext_SRCS
+    gettext-test.cc
+    )
+add_executable(gettext-test ${gettext_SRCS})
+target_link_libraries(gettext-test poppler)
diff --git a/test/gettext-test.cc b/test/gettext-test.cc
new file mode 100644
index 0000000..0c32a9e
--- /dev/null
+++ b/test/gettext-test.cc
@@ -0,0 +1,63 @@
+#include "config.h"
+#include "Page.h"
+#include <poppler-config.h>
+#include "GlobalParams.h"
+#include "Error.h"
+#include "PDFDoc.h"
+#include "goo/GooString.h"
+#include "TextOutputDev.h"
+
+int main (int argc, char *argv[])
+{
+  PDFDoc *doc;
+  GooString *inputName;
+  GooString *s;
+  char *result;
+  int page_index;
+  TextOutputDev *textOut;
+  Page *page;
+  PDFRectangle *rect;
+
+  // parse args
+  if (argc < 3) {
+    fprintf(stderr, "usage: %s INPUT-FILE page\n", argv[0]);
+    return 1;
+  }
+  if (!sscanf (argv[2], "%d", &page_index))
+  {
+    fprintf(stderr, "usage: %s INPUT-FILE page\n", argv[0]);
+    return 1;
+  }
+
+  inputName = new GooString(argv[1]);
+
+  globalParams = new GlobalParams();
+
+  doc = new PDFDoc(inputName);
+
+  if (!doc->isOk()) {
+    delete doc;
+    fprintf(stderr, "Error loading document !\n");
+    return 1;
+  }
+
+  page = doc->getCatalog()->getPage(1);
+
+  //textOut = new TextOutputDev(0, gFalse, gFalse, gFalse);
+  textOut = new TextOutputDev(0, gTrue, gTrue, gFalse);
+  doc->displayPageSlice(textOut, page_index, 72, 72,
+      0, false, true, false, -1, -1, -1, -1);
+
+  rect = page->getCropBox();
+  s = textOut->getText(rect->x1, rect->y1, rect->x2, rect->y2);
+
+  result = s->getCString ();
+  printf ("%s\n", result);
+
+  delete textOut;
+  delete s;
+
+  delete doc;
+  delete globalParams;
+  return 0;
+}
-- 
1.7.1


["0002-TextData-getText-in-rawOrder-now-count-chars.patch" (0002-TextData-getText-in-rawOrder-now-count-chars.patch)]

From accb938021cf2bd0f7ae37546c601623f5dc1f1b Mon Sep 17 00:00:00 2001
From: danigm <dani@danigm.net>
Date: Mon, 10 May 2010 10:14:57 +0200
Subject: [PATCH 2/2] TextData::getText in rawOrder now count chars

The previous getText in rawOrder mode only checked whether whole words lie
within the limits. This commit also adds the individual characters of a
partially included word that fall inside the limits.
---
 poppler/TextOutputDev.cc |   43 ++++++++++++++++++++++++++++++++-----------
 test/gettext-test.cc     |    1 +
 2 files changed, 33 insertions(+), 11 deletions(-)

diff --git a/poppler/TextOutputDev.cc b/poppler/TextOutputDev.cc
index 28b864b..4c42b30 100644
--- a/poppler/TextOutputDev.cc
+++ b/poppler/TextOutputDev.cc
@@ -3626,22 +3626,43 @@ GooString *TextPage::getText(double xMin, double yMin,
     TextWordList *wordlist;
     wordlist = makeWordList(gFalse);
     int word_length = wordlist->getLength ();
-    TextWord *word;
+    TextWord *word=NULL, *prev_word=NULL;
+    const Unicode *word_char;
+    char buf[8];
+    bool outOfBound = false;
     double xMinA, yMinA, xMaxA, yMaxA;
 
-    for (int i=0; i < word_length; i++)
-    {
+    for (int i=0; i < word_length; i++) {
       word = wordlist->get (i);
+
+      if (prev_word && word->primaryDelta (prev_word) <= 0) {
+        if (!outOfBound)
+            s->append(space, spaceLen);
+      } else {
+        s->append(eol, eolLen);
+      }
+
       word->getBBox (&xMinA, &yMinA, &xMaxA, &yMaxA);
-      if (xMinA > xMin && yMinA > yMin && xMaxA < xMax && yMaxA < yMax)
+      if (xMinA > xMin && yMinA > yMin && xMaxA < xMax && yMaxA < yMax) {
         s->append (word->getText ());
-      else
-        continue;
-      if (word->getNext() && word->getNext()->primaryDelta (word) <= 0)
-      {
-	s->append(space, spaceLen);
-      } else {
-	s->append(eol, eolLen);
+        prev_word = word;
+        outOfBound = false;
+      }
+      else if (xMinA < xMax && yMinA < yMax) {
+        for (int i=0; i < word->getLength(); i++) {
+          int n;
+          word->getCharBBox(i, &xMinA, &yMinA, &xMaxA, &yMaxA);
+          if (xMinA > xMin && yMinA > yMin && xMaxA < xMax && yMaxA < yMax) {
+            word_char = word->getChar(i);
+            n = uMap->mapUnicode(*word_char, buf, sizeof(buf));
+            s->append(buf, n);
+          }
+        }
+        prev_word = word;
+        outOfBound = true;
+      }
+      else {
+        outOfBound = true;
       }
     }
     return s;
diff --git a/test/gettext-test.cc b/test/gettext-test.cc
index 0c32a9e..58f07a9 100644
--- a/test/gettext-test.cc
+++ b/test/gettext-test.cc
@@ -50,6 +50,7 @@ int main (int argc, char *argv[])
 
   rect = page->getCropBox();
   s = textOut->getText(rect->x1, rect->y1, rect->x2, rect->y2);
+  //s = textOut->getText(0, 0, 200, 1000);
 
   result = s->getCString ();
   printf ("%s\n", result);
-- 
1.7.1



_______________________________________________
poppler mailing list
poppler@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/poppler


[prev in list] [next in list] [prev in thread] [next in thread] 

Configure | About | News | Add a list | Sponsored by KoreLogic