[prev in list] [next in list] [prev in thread] [next in thread] 

List:       kde-commits
Subject:    branches/work/sonnet-multilang
From:       Jakub Stachowski <qbast () go2 ! pl>
Date:       2009-03-29 19:40:01
Message-ID: 1238355601.195759.28053.nullmailer () svn ! kde ! org
[Download RAW message or body]

SVN commit 946614 by qbast:

Compile trigrams into a map (just like the Unicode data).
This speeds up loading from 1 s (cold cache) or 470 ms (hot cache) to 19 ms.
Whitespace fixes in the highlighter.



 M  +10 -30    kdecore/guesslanguage.cpp  
 M  +19 -20    kdeui/highlighter.cpp  
 M  +25 -0     unicode/CMakeLists.txt  
 M  +7 -4      unicode/data/CMakeLists.txt  
 A             unicode/parsetrigrams.cpp   [License: LGPL (v2.1+)]


--- branches/work/sonnet-multilang/kdecore/guesslanguage.cpp #946613:946614
@@ -21,6 +21,7 @@
 #include <QtCore/QCoreApplication>
 #include <QtCore/QFile>
 #include <QtCore/QFileInfo>
+#include <QtCore/QTime>
 
 #include "kglobal.h"
 #include "kstandarddirs.h"
@@ -156,38 +157,17 @@
 void GuessLanguagePrivate::load_models()
 {
 
-    QStringList modelsList;
-    {
-        KStandardDirs ksd;
-        ksd.addPrefix( QCoreApplication::applicationDirPath() );
-        modelsList = ksd.findAllResources("data", "sonnet/trigrams/*" );
-        //modelsList = ksd.findAllResources("data", "*.train" );
-    }
+    QTime t;
+    t.start();
+    QString triMapFile =  KStandardDirs::locate( "data", "sonnet/unicode/maps/Trigrams.map" );
 
-    QString modelFile;
+    QFile sin(triMapFile);
+    if ( ! sin.open(QIODevice::ReadOnly) )
+        return;
 
-    foreach (modelFile, modelsList)
-    {
-        QFile file(modelFile);
-        if (!file.open(QIODevice::ReadOnly | QIODevice::Text))
-            kDebug(750) << "Can't read " << modelFile.toLatin1();
-
-        QHash<QString,int> model;
-        QTextStream stream(&file);
-        stream.setCodec("UTF-8");
-        while (!stream.atEnd())
-        {
-            QString line = stream.readLine();
-            QRegExp rx("(.{3})\\s+(.*)");
-            int pos = rx.indexIn(line);
-            if ( pos != -1 )
-                model[line.left(3)] = rx.cap(2).toInt();
-        }
-        QFileInfo fi(modelFile);
-        QString modelName = fi.fileName().toLower();
-        models[modelName] = model;
-    }
-
+    QDataStream in(&sin);
+    in >> models;
+    kDebug() << "Loading took " << t.elapsed() << " ms";
 }
 
 QStringList GuessLanguagePrivate::find_runs(const QString & text)
--- branches/work/sonnet-multilang/kdeui/highlighter.cpp #946613:946614
@@ -273,27 +273,26 @@
         LanguageFilter langs(new SentenceTokenizer(text));
         
         while (langs.hasNext()) {
-        QStringRef sentence=langs.next();
-        if (d->dict->testAttribute(Speller::AutodetectLanguage)) {
-            if (!langs.isSpellcheckable()) continue;
-            d->dict->setLanguage(langs.language());
+            QStringRef sentence=langs.next();
+            if (d->dict->testAttribute(Speller::AutodetectLanguage)) {
+                if (!langs.isSpellcheckable()) continue;
+                d->dict->setLanguage(langs.language());
+            }
+            d->words->setBuffer( sentence.toString() );
+            int offset=sentence.position();
+            while ( d->words->hasNext() ) {
+                QStringRef word=d->words->next();
+                if (!d->words->isSpellcheckable()) continue;
+                ++d->wordCount;
+                if (d->dict->isMisspelled(word.toString())) {
+                    ++d->errorCount;
+                    setMisspelled(word.position()+offset, word.length());
+                    if (d->suggestionListeners)
+                        emit newSuggestions(word.toString(), d->dict->suggest(word.toString()));
+                } else
+                    unsetMisspelled(word.position()+offset, word.length());
+            }
         }
-        d->words->setBuffer( sentence.toString() );
-        int offset=sentence.position();
-//        Word w = d->filter->nextWord();
-        while ( d->words->hasNext() ) {
-            QStringRef word=d->words->next();
-            if (!d->words->isSpellcheckable()) continue;
-            ++d->wordCount;
-            if (d->dict->isMisspelled(word.toString())) {
-                ++d->errorCount;
-                setMisspelled(word.position()+offset, word.length());
-                if (d->suggestionListeners)
-                    emit newSuggestions(word.toString(), d->dict->suggest(word.toString()));
-            } else
-                unsetMisspelled(word.position()+offset, word.length());
-        }
-      }
     }
     //QTimer::singleShot( 0, this, SLOT(checkWords()) );
     setCurrentBlockState(0);
--- branches/work/sonnet-multilang/unicode/CMakeLists.txt #946613:946614
@@ -2,6 +2,10 @@
     parseucd.cpp
 )
 
+SET( parsetrigrams_SRCS
+    parsetrigrams.cpp
+)
+
 include_directories( ${QT_INCLUDES}
                     ${CMAKE_CURRENT_SOURCE_DIR}
                    )
@@ -12,12 +16,18 @@
 set( sonnet_map_dir ${CMAKE_CURRENT_BINARY_DIR}/data)
 
 kde4_add_executable( parseucd ${parseucd_SRCS} )
+kde4_add_executable( parsetrigrams ${parsetrigrams_SRCS} )
 
 TARGET_LINK_LIBRARIES( parseucd
                        ${QT_QTCORE_LIBRARY}
                      )
 
+TARGET_LINK_LIBRARIES( parsetrigrams
+                       ${QT_QTCORE_LIBRARY}
+                     )
+
 INSTALL(TARGETS parseucd DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
+INSTALL(TARGETS parsetrigrams DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
 
 MACRO(CREATE_UCD_MAP _target_name _in_FILE _out_FILE)
 
@@ -35,4 +45,19 @@
 
 ENDMACRO(CREATE_UCD_MAP)
 
+MACRO(CREATE_TRI_MAP _target_name _in_DIR _out_FILE)
+
+    GET_TARGET_PROPERTY(PARSETRIGRAMS_EXECUTABLE parsetrigrams LOCATION)
+
+    ADD_CUSTOM_TARGET(TRIGRAMS_${_target_name} ALL)
+
+    ADD_CUSTOM_COMMAND( TARGET TRIGRAMS_${_target_name}
+                        COMMAND ${PARSETRIGRAMS_EXECUTABLE} ${_in_DIR} > ${_out_FILE}
+                      )
+
+    ADD_DEPENDENCIES(TRIGRAMS_${_target_name} parsetrigrams)
+
+ENDMACRO(CREATE_TRI_MAP)
+
+
 add_subdirectory(data)
--- branches/work/sonnet-multilang/unicode/data/CMakeLists.txt #946613:946614
@@ -17,7 +17,13 @@
                 ${CMAKE_SOURCE_DIR}/unicode/data/unicode/ucd/Blocks.txt
                 ${sonnet_map_dir}/Blocks.map
               )
+              
+CREATE_TRI_MAP( Trigrams
+                ${CMAKE_SOURCE_DIR}/unicode/data/trigrams
+                ${sonnet_map_dir}/Trigrams.map
+              )
 
+
 ########### install files ###############
 
 install( FILES
@@ -33,6 +39,7 @@
           ${sonnet_map_dir}/SentenceBreakProperty.map
           ${sonnet_map_dir}/WordBreakProperty.map
           ${sonnet_map_dir}/Blocks.map
+          ${sonnet_map_dir}/Trigrams.map
          DESTINATION ${DATA_INSTALL_DIR}/sonnet/unicode/maps
        )
 
@@ -42,7 +49,6 @@
 ${CMAKE_SOURCE_DIR}/unicode/data/trigrams/az
 ${CMAKE_SOURCE_DIR}/unicode/data/trigrams/bg
 ${CMAKE_SOURCE_DIR}/unicode/data/trigrams/ca
-${CMAKE_SOURCE_DIR}/unicode/data/trigrams/ceb
 ${CMAKE_SOURCE_DIR}/unicode/data/trigrams/cs
 ${CMAKE_SOURCE_DIR}/unicode/data/trigrams/cy
 ${CMAKE_SOURCE_DIR}/unicode/data/trigrams/da
@@ -55,7 +61,6 @@
 ${CMAKE_SOURCE_DIR}/unicode/data/trigrams/fi
 ${CMAKE_SOURCE_DIR}/unicode/data/trigrams/fr
 ${CMAKE_SOURCE_DIR}/unicode/data/trigrams/ha
-${CMAKE_SOURCE_DIR}/unicode/data/trigrams/haw
 ${CMAKE_SOURCE_DIR}/unicode/data/trigrams/hi
 ${CMAKE_SOURCE_DIR}/unicode/data/trigrams/hr
 ${CMAKE_SOURCE_DIR}/unicode/data/trigrams/hu
@@ -73,7 +78,6 @@
 ${CMAKE_SOURCE_DIR}/unicode/data/trigrams/nl
 ${CMAKE_SOURCE_DIR}/unicode/data/trigrams/nb
 ${CMAKE_SOURCE_DIR}/unicode/data/trigrams/nr
-${CMAKE_SOURCE_DIR}/unicode/data/trigrams/nso
 ${CMAKE_SOURCE_DIR}/unicode/data/trigrams/pl
 ${CMAKE_SOURCE_DIR}/unicode/data/trigrams/ps
 ${CMAKE_SOURCE_DIR}/unicode/data/trigrams/pt
@@ -89,7 +93,6 @@
 ${CMAKE_SOURCE_DIR}/unicode/data/trigrams/sv
 ${CMAKE_SOURCE_DIR}/unicode/data/trigrams/sw
 ${CMAKE_SOURCE_DIR}/unicode/data/trigrams/tl
-${CMAKE_SOURCE_DIR}/unicode/data/trigrams/tlh
 ${CMAKE_SOURCE_DIR}/unicode/data/trigrams/tn
 ${CMAKE_SOURCE_DIR}/unicode/data/trigrams/tr
 ${CMAKE_SOURCE_DIR}/unicode/data/trigrams/ts
[prev in list] [next in list] [prev in thread] [next in thread] 

Configure | About | News | Add a list | Sponsored by KoreLogic