[prev in list] [next in list] [prev in thread] [next in thread]
List: kde-commits
Subject: branches/work/sonnet-multilang
From: Jakub Stachowski <qbast () go2 ! pl>
Date: 2009-03-29 19:40:01
Message-ID: 1238355601.195759.28053.nullmailer () svn ! kde ! org
[Download RAW message or body]
SVN commit 946614 by qbast:
Compile trigrams into a map (just like the Unicode data).
This speeds up loading from 1s (cold cache) or 470ms (hot cache) to 19ms.
Whitespace fixes in highlighter.
M +10 -30 kdecore/guesslanguage.cpp
M +19 -20 kdeui/highlighter.cpp
M +25 -0 unicode/CMakeLists.txt
M +7 -4 unicode/data/CMakeLists.txt
A unicode/parsetrigrams.cpp [License: LGPL (v2.1+)]
--- branches/work/sonnet-multilang/kdecore/guesslanguage.cpp #946613:946614
@@ -21,6 +21,7 @@
#include <QtCore/QCoreApplication>
#include <QtCore/QFile>
#include <QtCore/QFileInfo>
+#include <QtCore/QTime>
#include "kglobal.h"
#include "kstandarddirs.h"
@@ -156,38 +157,17 @@
void GuessLanguagePrivate::load_models()
{
- QStringList modelsList;
- {
- KStandardDirs ksd;
- ksd.addPrefix( QCoreApplication::applicationDirPath() );
- modelsList = ksd.findAllResources("data", "sonnet/trigrams/*" );
- //modelsList = ksd.findAllResources("data", "*.train" );
- }
+ QTime t;
+ t.start();
+ QString triMapFile = KStandardDirs::locate( "data", "sonnet/unicode/maps/Trigrams.map" );
- QString modelFile;
+ QFile sin(triMapFile);
+ if ( ! sin.open(QIODevice::ReadOnly) )
+ return;
- foreach (modelFile, modelsList)
- {
- QFile file(modelFile);
- if (!file.open(QIODevice::ReadOnly | QIODevice::Text))
- kDebug(750) << "Can't read " << modelFile.toLatin1();
-
- QHash<QString,int> model;
- QTextStream stream(&file);
- stream.setCodec("UTF-8");
- while (!stream.atEnd())
- {
- QString line = stream.readLine();
- QRegExp rx("(.{3})\\s+(.*)");
- int pos = rx.indexIn(line);
- if ( pos != -1 )
- model[line.left(3)] = rx.cap(2).toInt();
- }
- QFileInfo fi(modelFile);
- QString modelName = fi.fileName().toLower();
- models[modelName] = model;
- }
-
+ QDataStream in(&sin);
+ in >> models;
+ kDebug() << "Loading took " << t.elapsed() << " ms";
}
QStringList GuessLanguagePrivate::find_runs(const QString & text)
--- branches/work/sonnet-multilang/kdeui/highlighter.cpp #946613:946614
@@ -273,27 +273,26 @@
LanguageFilter langs(new SentenceTokenizer(text));
while (langs.hasNext()) {
- QStringRef sentence=langs.next();
- if (d->dict->testAttribute(Speller::AutodetectLanguage)) {
- if (!langs.isSpellcheckable()) continue;
- d->dict->setLanguage(langs.language());
+ QStringRef sentence=langs.next();
+ if (d->dict->testAttribute(Speller::AutodetectLanguage)) {
+ if (!langs.isSpellcheckable()) continue;
+ d->dict->setLanguage(langs.language());
+ }
+ d->words->setBuffer( sentence.toString() );
+ int offset=sentence.position();
+ while ( d->words->hasNext() ) {
+ QStringRef word=d->words->next();
+ if (!d->words->isSpellcheckable()) continue;
+ ++d->wordCount;
+ if (d->dict->isMisspelled(word.toString())) {
+ ++d->errorCount;
+ setMisspelled(word.position()+offset, word.length());
+ if (d->suggestionListeners)
+ emit newSuggestions(word.toString(), d->dict->suggest(word.toString()));
+ } else
+ unsetMisspelled(word.position()+offset, word.length());
+ }
}
- d->words->setBuffer( sentence.toString() );
- int offset=sentence.position();
-// Word w = d->filter->nextWord();
- while ( d->words->hasNext() ) {
- QStringRef word=d->words->next();
- if (!d->words->isSpellcheckable()) continue;
- ++d->wordCount;
- if (d->dict->isMisspelled(word.toString())) {
- ++d->errorCount;
- setMisspelled(word.position()+offset, word.length());
- if (d->suggestionListeners)
- emit newSuggestions(word.toString(), d->dict->suggest(word.toString()));
- } else
- unsetMisspelled(word.position()+offset, word.length());
- }
- }
}
//QTimer::singleShot( 0, this, SLOT(checkWords()) );
setCurrentBlockState(0);
--- branches/work/sonnet-multilang/unicode/CMakeLists.txt #946613:946614
@@ -2,6 +2,10 @@
parseucd.cpp
)
+SET( parsetrigrams_SRCS
+ parsetrigrams.cpp
+)
+
include_directories( ${QT_INCLUDES}
${CMAKE_CURRENT_SOURCE_DIR}
)
@@ -12,12 +16,18 @@
set( sonnet_map_dir ${CMAKE_CURRENT_BINARY_DIR}/data)
kde4_add_executable( parseucd ${parseucd_SRCS} )
+kde4_add_executable( parsetrigrams ${parsetrigrams_SRCS} )
TARGET_LINK_LIBRARIES( parseucd
${QT_QTCORE_LIBRARY}
)
+TARGET_LINK_LIBRARIES( parsetrigrams
+ ${QT_QTCORE_LIBRARY}
+ )
+
INSTALL(TARGETS parseucd DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
+INSTALL(TARGETS parsetrigrams DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
MACRO(CREATE_UCD_MAP _target_name _in_FILE _out_FILE)
@@ -35,4 +45,19 @@
ENDMACRO(CREATE_UCD_MAP)
+MACRO(CREATE_TRI_MAP _target_name _in_DIR _out_FILE)
+
+ GET_TARGET_PROPERTY(PARSETRIGRAMS_EXECUTABLE parsetrigrams LOCATION)
+
+ ADD_CUSTOM_TARGET(TRIGRAMS_${_target_name} ALL)
+
+ ADD_CUSTOM_COMMAND( TARGET TRIGRAMS_${_target_name}
+ COMMAND ${PARSETRIGRAMS_EXECUTABLE} ${_in_DIR} > ${_out_FILE}
+ )
+
+ ADD_DEPENDENCIES(TRIGRAMS_${_target_name} parsetrigrams)
+
+ENDMACRO(CREATE_TRI_MAP)
+
+
add_subdirectory(data)
--- branches/work/sonnet-multilang/unicode/data/CMakeLists.txt #946613:946614
@@ -17,7 +17,13 @@
${CMAKE_SOURCE_DIR}/unicode/data/unicode/ucd/Blocks.txt
${sonnet_map_dir}/Blocks.map
)
+
+CREATE_TRI_MAP( Trigrams
+ ${CMAKE_SOURCE_DIR}/unicode/data/trigrams
+ ${sonnet_map_dir}/Trigrams.map
+ )
+
########### install files ###############
install( FILES
@@ -33,6 +39,7 @@
${sonnet_map_dir}/SentenceBreakProperty.map
${sonnet_map_dir}/WordBreakProperty.map
${sonnet_map_dir}/Blocks.map
+ ${sonnet_map_dir}/Trigrams.map
DESTINATION ${DATA_INSTALL_DIR}/sonnet/unicode/maps
)
@@ -42,7 +49,6 @@
${CMAKE_SOURCE_DIR}/unicode/data/trigrams/az
${CMAKE_SOURCE_DIR}/unicode/data/trigrams/bg
${CMAKE_SOURCE_DIR}/unicode/data/trigrams/ca
-${CMAKE_SOURCE_DIR}/unicode/data/trigrams/ceb
${CMAKE_SOURCE_DIR}/unicode/data/trigrams/cs
${CMAKE_SOURCE_DIR}/unicode/data/trigrams/cy
${CMAKE_SOURCE_DIR}/unicode/data/trigrams/da
@@ -55,7 +61,6 @@
${CMAKE_SOURCE_DIR}/unicode/data/trigrams/fi
${CMAKE_SOURCE_DIR}/unicode/data/trigrams/fr
${CMAKE_SOURCE_DIR}/unicode/data/trigrams/ha
-${CMAKE_SOURCE_DIR}/unicode/data/trigrams/haw
${CMAKE_SOURCE_DIR}/unicode/data/trigrams/hi
${CMAKE_SOURCE_DIR}/unicode/data/trigrams/hr
${CMAKE_SOURCE_DIR}/unicode/data/trigrams/hu
@@ -73,7 +78,6 @@
${CMAKE_SOURCE_DIR}/unicode/data/trigrams/nl
${CMAKE_SOURCE_DIR}/unicode/data/trigrams/nb
${CMAKE_SOURCE_DIR}/unicode/data/trigrams/nr
-${CMAKE_SOURCE_DIR}/unicode/data/trigrams/nso
${CMAKE_SOURCE_DIR}/unicode/data/trigrams/pl
${CMAKE_SOURCE_DIR}/unicode/data/trigrams/ps
${CMAKE_SOURCE_DIR}/unicode/data/trigrams/pt
@@ -89,7 +93,6 @@
${CMAKE_SOURCE_DIR}/unicode/data/trigrams/sv
${CMAKE_SOURCE_DIR}/unicode/data/trigrams/sw
${CMAKE_SOURCE_DIR}/unicode/data/trigrams/tl
-${CMAKE_SOURCE_DIR}/unicode/data/trigrams/tlh
${CMAKE_SOURCE_DIR}/unicode/data/trigrams/tn
${CMAKE_SOURCE_DIR}/unicode/data/trigrams/tr
${CMAKE_SOURCE_DIR}/unicode/data/trigrams/ts
[prev in list] [next in list] [prev in thread] [next in thread]
Configure |
About |
News |
Add a list |
Sponsored by KoreLogic