[prev in list] [next in list] [prev in thread] [next in thread]
List: jakarta-commons-dev
Subject: cvs commit: jakarta-commons-sandbox/feedparser/tests/anchor anchor1.html anchor2.html anchor3.html a
From: burton () apache ! org
Date: 2004-08-31 21:02:40
Message-ID: 20040831210240.48761.qmail () minotaur ! apache ! org
[Download RAW message or body]
burton 2004/08/31 14:02:40
Added: feedparser/src/java/org/apache/commons/feedparser
FeedFilter.java
feedparser/tests/anchor anchor1.html anchor2.html
anchor3.html anchor4.html
Log:
Add the feed filter and supporting anchor tests.
Revision Changes Path
1.1 \
jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/FeedFilter.java
Index: FeedFilter.java
===================================================================
/*
* Copyright 1999,2004 The Apache Software Foundation.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.commons.feedparser;
import java.io.*;
import java.net.*;
import java.util.*;
import java.util.regex.*;
/**
 * Pre-parse clean-up for the raw bytes of a feed.
 *
 * <p>{@link #parse(byte[])} applies two passes to the content:</p>
 * <ol>
 *   <li>everything in front of the first <code>&lt;</code>, and then everything
 *       in front of the first <code>&lt;?xml</code>, is dropped;</li>
 *   <li>named entities listed in {@link #LATIN1_ENTITIES} are rewritten as
 *       numeric character references (<code>&amp;eacute;</code> becomes
 *       <code>&amp;#233;</code>). Names that are not in the table are left
 *       exactly as they were.</li>
 * </ol>
 *
 * <p>Both passes only look at ASCII characters, so the bytes are converted to
 * and from text with ISO-8859-1, which maps every byte value to exactly one
 * character. Bytes the filter does not touch therefore come back unchanged,
 * independent of the platform default charset.</p>
 *
 * @author <a href="mailto:burton@apache.org">Kevin A. Burton (burtonator)</a>
 * @version $Id: FeedFilter.java,v 1.1 2004/08/31 21:02:40 burton Exp $
 */
public class FeedFilter {

    /**
     * Entity table used by the decoding pass. Keys are entity names without
     * the surrounding <code>&amp;</code> and <code>;</code> (for example
     * <code>"eacute"</code>); values are the decimal code point as a
     * <code>String</code> (for example <code>"233"</code>). Populated by the
     * static initializer at the bottom of this class.
     */
    public static HashMap LATIN1_ENTITIES = new HashMap();

    /** Matches a purely alphabetic named entity; group 1 is the bare name. */
    private static final Pattern ENTITY_PATTERN = Pattern.compile( "&([a-zA-Z]+);" );

    /**
     * Charset used to view the feed bytes as text. ISO-8859-1 is a 1:1
     * byte/char mapping, so decoding and re-encoding never alters a byte.
     */
    private static final String BYTE_CHARSET = "ISO-8859-1";

    /**
     * Runs the prolog and entity passes over a feed.
     *
     * @param bytes the raw feed content; must not be <code>null</code>
     * @return the filtered content as a new byte array
     * @throws NullPointerException if <code>bytes</code> is <code>null</code>
     */
    public static byte[] parse( byte[] bytes ) {

        try {

            // Decode with a fixed, lossless charset. The platform default
            // would replace byte sequences it cannot decode (for instance
            // Latin-1 bytes on a UTF-8 platform) and corrupt the feed.
            String content = new String( bytes, BYTE_CHARSET );

            //remove leading prolog...
            content = doRemoveLeadingProlog( content );
            content = doDecodeEntities( content );

            return content.getBytes( BYTE_CHARSET );

        } catch ( UnsupportedEncodingException e ) {
            // Every Java platform is required to support ISO-8859-1.
            throw new RuntimeException( "Required charset is missing: " + BYTE_CHARSET, e );
        }

    }

    /**
     * Removes prolog whitespace, comments, and other garbage from the
     * beginning of a feed.
     *
     * <p>NOTE(review): the second step searches the whole remaining content
     * for <code>&lt;?xml</code>, so it also matches an occurrence later in the
     * document (and other targets starting with "xml"). Kept as in the
     * original; confirm whether it should be limited to the leading prolog.</p>
     *
     * @param content the decoded feed text
     * @return the text starting at the first <code>&lt;?xml</code> if one
     *         exists, otherwise at the first <code>&lt;</code> if one exists,
     *         otherwise unchanged
     * @author <a href="mailto:burton@rojo.com">Kevin A. Burton</a>
     */
    private static String doRemoveLeadingProlog( String content ) {

        //move to the beginning of the first element or comment. When this is a
        //processing instruction we will move to that
        int begin = content.indexOf( "<" );

        if ( begin > 0 ) {
            content = content.substring( begin );
        }

        //now skip to the XML processing instruction when necessary.
        begin = content.indexOf( "<?xml" );

        if ( begin > 0 ) {
            content = content.substring( begin );
        }

        return content;

    }

    /**
     * Rewrites every named entity found in {@link #LATIN1_ENTITIES} as the
     * matching numeric character reference and copies everything else
     * through untouched.
     *
     * @param content the feed text to scan
     * @return the text with known entities replaced
     */
    private static String doDecodeEntities( String content ) {

        // Some extra room beyond the input length to limit buffer regrowth.
        StringBuffer buff = new StringBuffer( content.length() + 1000 );

        Matcher m = ENTITY_PATTERN.matcher( content );

        // Index of the first character not yet copied into buff.
        int begin = 0;

        while ( m.find() ) {

            // Text between the previous match and this one.
            buff.append( content.substring( begin, m.start() ) );

            String entity = m.group( 1 );
            String value = (String)LATIN1_ENTITIES.get( entity );

            if ( value != null ) {
                buff.append( "&#" );
                buff.append( value );
                buff.append( ";" );
            } else {
                // Not in the table: emit the reference exactly as it was.
                buff.append( "&" );
                buff.append( entity );
                buff.append( ";" );
            }

            begin = m.end( 0 );

        }

        // Remainder after the last match (or the whole input if none matched).
        buff.append( content.substring( begin, content.length() ) );

        return buff.toString();

    }

    /**
     * Small manual demo: filters a sample string and prints the result.
     *
     * @param args ignored
     */
    public static void main( String[] args ) {

        // The sample is plain ASCII, so the default charset is harmless here.
        byte[] b = parse( "hello &eacute; world".getBytes() );

        String v = new String( b );

        System.out.println( "v: " + v );

    }

    static {

        // load the latin1 entity map. We will replace latin1 entities with
        // their char references directly. For example if someone incorrectly
        // references:
        //
        // &auml;
        //
        // we replace it with:
        //
        // &#228;
        //
        // Which is correct in Latin1

        LATIN1_ENTITIES.put( "AElig", "198" );
        LATIN1_ENTITIES.put( "Aacute", "193" );
        LATIN1_ENTITIES.put( "Acirc", "194" );
        LATIN1_ENTITIES.put( "Agrave", "192" );
        LATIN1_ENTITIES.put( "Aring", "197" );
        LATIN1_ENTITIES.put( "Atilde", "195" );
        LATIN1_ENTITIES.put( "Auml", "196" );
        LATIN1_ENTITIES.put( "Ccedil", "199" );
        LATIN1_ENTITIES.put( "ETH", "208" );
        LATIN1_ENTITIES.put( "Eacute", "201" );
        LATIN1_ENTITIES.put( "Ecirc", "202" );
        LATIN1_ENTITIES.put( "Egrave", "200" );
        LATIN1_ENTITIES.put( "Euml", "203" );
        LATIN1_ENTITIES.put( "Iacute", "205" );
        LATIN1_ENTITIES.put( "Icirc", "206" );
        LATIN1_ENTITIES.put( "Igrave", "204" );
        LATIN1_ENTITIES.put( "Iuml", "207" );
        LATIN1_ENTITIES.put( "Ntilde", "209" );
        LATIN1_ENTITIES.put( "Oacute", "211" );
        LATIN1_ENTITIES.put( "Ocirc", "212" );
        LATIN1_ENTITIES.put( "Ograve", "210" );
        LATIN1_ENTITIES.put( "Oslash", "216" );
        LATIN1_ENTITIES.put( "Otilde", "213" );
        LATIN1_ENTITIES.put( "Ouml", "214" );
        LATIN1_ENTITIES.put( "THORN", "222" );
        LATIN1_ENTITIES.put( "Uacute", "218" );
        LATIN1_ENTITIES.put( "Ucirc", "219" );
        LATIN1_ENTITIES.put( "Ugrave", "217" );
        LATIN1_ENTITIES.put( "Uuml", "220" );
        LATIN1_ENTITIES.put( "Yacute", "221" );
        LATIN1_ENTITIES.put( "aacute", "225" );
        LATIN1_ENTITIES.put( "acirc", "226" );
        LATIN1_ENTITIES.put( "aelig", "230" );
        LATIN1_ENTITIES.put( "agrave", "224" );
        LATIN1_ENTITIES.put( "aring", "229" );
        LATIN1_ENTITIES.put( "atilde", "227" );
        LATIN1_ENTITIES.put( "auml", "228" );
        LATIN1_ENTITIES.put( "ccedil", "231" );
        LATIN1_ENTITIES.put( "eacute", "233" );
        LATIN1_ENTITIES.put( "ecirc", "234" );
        LATIN1_ENTITIES.put( "egrave", "232" );
        LATIN1_ENTITIES.put( "eth", "240" );
        LATIN1_ENTITIES.put( "euml", "235" );
        LATIN1_ENTITIES.put( "iacute", "237" );
        LATIN1_ENTITIES.put( "icirc", "238" );
        LATIN1_ENTITIES.put( "igrave", "236" );
        LATIN1_ENTITIES.put( "iuml", "239" );
        LATIN1_ENTITIES.put( "ntilde", "241" );
        LATIN1_ENTITIES.put( "oacute", "243" );
        LATIN1_ENTITIES.put( "ocirc", "244" );
        LATIN1_ENTITIES.put( "ograve", "242" );
        LATIN1_ENTITIES.put( "oslash", "248" );
        LATIN1_ENTITIES.put( "otilde", "245" );
        LATIN1_ENTITIES.put( "ouml", "246" );
        LATIN1_ENTITIES.put( "szlig", "223" );
        LATIN1_ENTITIES.put( "thorn", "254" );
        LATIN1_ENTITIES.put( "uacute", "250" );
        LATIN1_ENTITIES.put( "ucirc", "251" );
        LATIN1_ENTITIES.put( "ugrave", "249" );
        LATIN1_ENTITIES.put( "uuml", "252" );
        LATIN1_ENTITIES.put( "yacute", "253" );
        LATIN1_ENTITIES.put( "yuml", "255" );

    }

}
1.1 jakarta-commons-sandbox/feedparser/tests/anchor/anchor1.html
Index: anchor1.html
===================================================================
<a href="http://peerfear.org">
</a>
1.1 jakarta-commons-sandbox/feedparser/tests/anchor/anchor2.html
Index: anchor2.html
===================================================================
<a href=http://peerfear.org target=foo >
</a>
1.1 jakarta-commons-sandbox/feedparser/tests/anchor/anchor3.html
Index: anchor3.html
===================================================================
<a href=http://peerfear.org>
</a>
1.1 jakarta-commons-sandbox/feedparser/tests/anchor/anchor4.html
Index: anchor4.html
===================================================================
<a target=bar href=http://peerfear.org>
this is the anchor body
</a>
---------------------------------------------------------------------
To unsubscribe, e-mail: commons-dev-unsubscribe@jakarta.apache.org
For additional commands, e-mail: commons-dev-help@jakarta.apache.org
[prev in list] [next in list] [prev in thread] [next in thread]
Configure |
About |
News |
Add a list |
Sponsored by KoreLogic