[prev in list] [next in list] [prev in thread] [next in thread]
List: wekalist
Subject: [Wekalist] LibSVMLoader for huge datasets
From: Victor-Philipp Negoescu <victor () viathinksoft ! de>
Date: 2013-12-20 1:26:39
Message-ID: 1010042075.20131220022639 () viathinksoft ! de
[Download RAW message or body]
Hello dear Weka users & admins,
the LibSVMLoader class (method getStructure()) reads the input file
char-by-char which is quite slow and inefficient.
I improved the method as follows. You are free to update Weka by
integrating my code.
==========>8==========>8==========>8==========>8==========>8=======
public Instances getStructure() throws IOException {
String line;
int cInt;
char c;
int numAtt;
FastVector atts;
int i;
String relName;
if (m_sourceReader == null)
throw new IOException("No source has been specified");
if (m_structure == null) {
m_Buffer = new Vector();
try {
// determine number of attributes
numAtt = 0;
int len = 1024 * 1024 * 8; // 8 MB
char[] cbuf = new char[len];
int iter = 0;
String linesplitter = null;
String[] lines;
String oldLine = null;
String read = null;
while ((cInt = m_sourceReader.read(cbuf, 0, len)) != -1) {
iter++;
if (iter % 10 == 0) {
System.out.println("Read iteration #" + iter + " (" + \
(len/1024/1024*(iter-1)) + " MB read)"); }
read = String.valueOf(cbuf, 0, cInt);
if (oldLine != null) {
read = oldLine + read;
}
if (linesplitter == null) {
if (read.contains("\r\n"))
linesplitter = "\r\n";
else if (read.contains("\n"))
linesplitter = "\n";
}
if (linesplitter != null) {
lines = read.split(linesplitter, -1);
} else {
lines = new String[]{read};
}
for (int j = 0; j < lines.length-1; j++) {
line = lines[j];
m_Buffer.add(libsvmToArray(line));
numAtt = determineNumAttributes(line, numAtt);
}
oldLine = lines[lines.length-1];
}
// last line?
if (oldLine != null && oldLine.length() != 0) {
m_Buffer.add(libsvmToArray(oldLine));
numAtt = determineNumAttributes(oldLine, numAtt);
}
// generate header
atts = new FastVector(numAtt);
for (i = 0; i < numAtt - 1; i++)
atts.addElement(new Attribute("att_" + (i+1)));
atts.addElement(new Attribute("class"));
if (!m_URL.equals("http://"))
relName = m_URL;
else
relName = m_File;
m_structure = new Instances(relName, atts, 0);
m_structure.setClassIndex(m_structure.numAttributes() - 1);
}
catch (Exception ex) {
ex.printStackTrace();
throw new IOException("Unable to determine structure as libsvm: " + ex);
}
}
return new Instances(m_structure, 0);
}
==========8<==========8<==========8<==========8<==========8<=======
Regards,
Victor-Philipp Negoescu, ViaThinkSoft
http://www.viathinksoft.com
_______________________________________________
Wekalist mailing list
Send posts to: Wekalist@list.waikato.ac.nz
List info and subscription status: \
http://list.waikato.ac.nz/mailman/listinfo/wekalist List etiquette: \
http://www.cs.waikato.ac.nz/~ml/weka/mailinglist_etiquette.html
[prev in list] [next in list] [prev in thread] [next in thread]
Configure |
About |
News |
Add a list |
Sponsored by KoreLogic