[prev in list] [next in list] [prev in thread] [next in thread] 

List:       jakarta-commons-dev
Subject:    cvs commit: jakarta-commons-sandbox/feedparser/tests/anchor anchor1.html anchor2.html anchor3.html a
From:       burton () apache ! org
Date:       2004-08-31 21:02:40
Message-ID: 20040831210240.48761.qmail () minotaur ! apache ! org
[Download RAW message or body]

burton      2004/08/31 14:02:40

  Added:       feedparser/src/java/org/apache/commons/feedparser
                        FeedFilter.java
               feedparser/tests/anchor anchor1.html anchor2.html
                        anchor3.html anchor4.html
  Log:
  Added feed filter and supporting tests
  
  Revision  Changes    Path
  1.1                  \
jakarta-commons-sandbox/feedparser/src/java/org/apache/commons/feedparser/FeedFilter.java
  
  Index: FeedFilter.java
  ===================================================================
  /*
   * Copyright 1999,2004 The Apache Software Foundation.
   * 
   * Licensed under the Apache License, Version 2.0 (the "License");
   * you may not use this file except in compliance with the License.
   * You may obtain a copy of the License at
   * 
   *      http://www.apache.org/licenses/LICENSE-2.0
   * 
   * Unless required by applicable law or agreed to in writing, software
   * distributed under the License is distributed on an "AS IS" BASIS,
   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   * See the License for the specific language governing permissions and
   * limitations under the License.
   */
  
  package org.apache.commons.feedparser;
  
  import java.io.*;
  import java.net.*;
  import java.util.*;
  import java.util.regex.*;
  
  /**
   * Pre-filters the raw bytes of a feed before they are handed to an XML parser.
   *
   * <p>Two clean-ups are applied, in this order:</p>
   * <ol>
   *   <li>anything in front of the first <code>&lt;</code> (and, when present,
   *       in front of the <code>&lt;?xml</code> declaration) is dropped;</li>
   *   <li>named HTML Latin-1 entities such as <code>&amp;eacute;</code> are
   *       rewritten as numeric character references such as
   *       <code>&amp;#233;</code>, which do not need a DTD to resolve.</li>
   * </ol>
   *
   * <p>The bytes are decoded and re-encoded as ISO-8859-1.  That charset maps
   * every byte value to exactly one char and back, so bytes the filter does not
   * deliberately rewrite come out unchanged whatever encoding the feed really
   * uses.  The filter only looks for ASCII characters, so this is intended for
   * ASCII-compatible feed encodings.</p>
   *
   * @author <a href="mailto:burton@apache.org">Kevin A. Burton (burtonator)</a>
   * @version $Id: FeedFilter.java,v 1.1 2004/08/31 21:02:40 burton Exp $
   */
  public class FeedFilter {
  
      /**
       * HTML Latin-1 entity table: entity name without the surrounding
       * <code>&amp;</code> and <code>;</code> (String) mapped to the decimal
       * code point as text (String), e.g. <code>"eacute"</code> to
       * <code>"233"</code>.  Populated by the static initializer at the bottom of
       * this class.
       */
      public static HashMap LATIN1_ENTITIES = new HashMap();
  
      /**
       * Matches a named entity reference; group 1 is the bare name.  Digits are
       * allowed after the first letter so that names such as <code>sup2</code>
       * and <code>frac12</code> are recognised.
       */
      private static final Pattern ENTITY_PATTERN =
          Pattern.compile( "&([a-zA-Z][a-zA-Z0-9]*);" );
  
      /**
       * Charset used to move between bytes and chars.  ISO-8859-1 is a lossless
       * one-byte-per-char mapping, which keeps untouched bytes intact.
       */
      private static final String BYTE_CHARSET = "ISO-8859-1";
      
      /**
       * Filters a raw feed: strips leading garbage, then replaces known Latin-1
       * named entities with numeric character references.
       *
       * @param bytes the raw feed content
       * @return the filtered content as a new byte array; bytes outside the
       *         removed prolog and the rewritten entities are unchanged
       */
      public static byte[] parse( byte[] bytes ) {
  
          String content = toText( bytes );
  
          content = doRemoveLeadingProlog( content );
          content = doDecodeEntities( content );
          
          return toBytes( content );
          
      }
  
      /**
       * Decodes bytes one-to-one into chars using {@link #BYTE_CHARSET}.
       */
      private static String toText( byte[] bytes ) {
  
          try {
              return new String( bytes, BYTE_CHARSET );
          } catch ( UnsupportedEncodingException e ) {
              // ISO-8859-1 is one of the charsets every Java platform must provide.
              throw new RuntimeException( "Required charset missing: " + BYTE_CHARSET, e );
          }
          
      }
  
      /**
       * Encodes chars back into bytes using {@link #BYTE_CHARSET}.
       */
      private static byte[] toBytes( String text ) {
  
          try {
              return text.getBytes( BYTE_CHARSET );
          } catch ( UnsupportedEncodingException e ) {
              // ISO-8859-1 is one of the charsets every Java platform must provide.
              throw new RuntimeException( "Required charset missing: " + BYTE_CHARSET, e );
          }
          
      }
  
      /**
       * Removes prolog whitespace, comments, and other garbage from the
       * beginning of a feed.
       *
       * @param content the decoded feed text
       * @return the text starting at the first <code>&lt;</code>, or at the
       *         first <code>&lt;?xml</code> when one occurs after that point;
       *         the input unchanged when neither is found past index 0
       * @author <a href="mailto:burton@rojo.com">Kevin A. Burton</a>
       */
      private static String doRemoveLeadingProlog( String content ) {
  
          // Move to the beginning of the first element, comment or processing
          // instruction.  A result of 0 (already there) or -1 (no markup at all)
          // leaves the text alone.
          int begin = content.indexOf( "<" );
  
          if ( begin > 0 ) {
              content = content.substring( begin );
          }
  
          // Now skip to the XML declaration when it is not already first.
          // NOTE(review): this matches "<?xml" anywhere in the document, so a
          // literal "<?xml" further down (e.g. inside CDATA) would truncate the
          // feed -- confirm whether that can occur in practice.
          begin = content.indexOf( "<?xml" );
  
          if ( begin > 0 ) {
              content = content.substring( begin );
          }
  
          return content;
          
      }
  
      /**
       * Rewrites every named entity found in {@link #LATIN1_ENTITIES} as a
       * numeric character reference.  Entity references whose name is not in the
       * table (for example <code>&amp;amp;</code>) are copied through verbatim.
       *
       * @param content the text to scan
       * @return the text with known entities replaced
       */
      private static String doDecodeEntities( String content ) {
  
          StringBuffer buff = new StringBuffer( content.length() + 1000 );
  
          Matcher m = ENTITY_PATTERN.matcher( content );
  
          // index of the first char not yet copied into buff
          int begin = 0;
  
          while ( m.find() ) {
  
              // copy the text between the previous match and this one
              buff.append( content.substring( begin, m.start() ) );
              
              String value = (String)LATIN1_ENTITIES.get( m.group( 1 ) );
  
              if ( value != null ) {
                  buff.append( "&#" );
                  buff.append( value );
                  buff.append( ";" );
              } else {
                  // unknown name: keep the reference exactly as written
                  buff.append( m.group( 0 ) );
              }
  
              begin = m.end( 0 );
              
          } 
  
          // copy whatever follows the last match
          buff.append( content.substring( begin ) );
  
          return buff.toString();
          
      }
      
      /**
       * Small manual check: filters a sample string and prints the result.
       */
      public static void main( String[] args ) {
  
          byte[] b = parse( toBytes( "hello &eacute; world" ) );
  
          String v = toText( b );
  
          System.out.println( "v: " + v );
          
      }
      
      static {
  
          // load the latin1 entity map.  We will replace latin1 entities with
          // their char references directly.  For example if someone incorrectly
          // references:
          //
          // &auml;
          //
          // we replace it with:
          //
          // &#228;
          //
          // Which is correct in Latin1
  
          // letters
  
          LATIN1_ENTITIES.put( "AElig",     "198" );
          LATIN1_ENTITIES.put( "Aacute",    "193" );
          LATIN1_ENTITIES.put( "Acirc",     "194" );
          LATIN1_ENTITIES.put( "Agrave",    "192" );
          LATIN1_ENTITIES.put( "Aring",     "197" );
          LATIN1_ENTITIES.put( "Atilde",    "195" );
          LATIN1_ENTITIES.put( "Auml",      "196" );
          LATIN1_ENTITIES.put( "Ccedil",    "199" );
          LATIN1_ENTITIES.put( "ETH",       "208" );
          LATIN1_ENTITIES.put( "Eacute",    "201" );
          LATIN1_ENTITIES.put( "Ecirc",     "202" );
          LATIN1_ENTITIES.put( "Egrave",    "200" );
          LATIN1_ENTITIES.put( "Euml",      "203" );
          LATIN1_ENTITIES.put( "Iacute",    "205" );
          LATIN1_ENTITIES.put( "Icirc",     "206" );
          LATIN1_ENTITIES.put( "Igrave",    "204" );
          LATIN1_ENTITIES.put( "Iuml",      "207" );
          LATIN1_ENTITIES.put( "Ntilde",    "209" );
          LATIN1_ENTITIES.put( "Oacute",    "211" );
          LATIN1_ENTITIES.put( "Ocirc",     "212" );
          LATIN1_ENTITIES.put( "Ograve",    "210" );
          LATIN1_ENTITIES.put( "Oslash",    "216" );
          LATIN1_ENTITIES.put( "Otilde",    "213" );
          LATIN1_ENTITIES.put( "Ouml",      "214" );
          LATIN1_ENTITIES.put( "THORN",     "222" );
          LATIN1_ENTITIES.put( "Uacute",    "218" );
          LATIN1_ENTITIES.put( "Ucirc",     "219" );
          LATIN1_ENTITIES.put( "Ugrave",    "217" );
          LATIN1_ENTITIES.put( "Uuml",      "220" );
          LATIN1_ENTITIES.put( "Yacute",    "221" );
          LATIN1_ENTITIES.put( "aacute",    "225" );
          LATIN1_ENTITIES.put( "acirc",     "226" );
          LATIN1_ENTITIES.put( "aelig",     "230" );
          LATIN1_ENTITIES.put( "agrave",    "224" );
          LATIN1_ENTITIES.put( "aring",     "229" );
          LATIN1_ENTITIES.put( "atilde",    "227" );
          LATIN1_ENTITIES.put( "auml",      "228" );
          LATIN1_ENTITIES.put( "ccedil",    "231" );
          LATIN1_ENTITIES.put( "eacute",    "233" );
          LATIN1_ENTITIES.put( "ecirc",     "234" );
          LATIN1_ENTITIES.put( "egrave",    "232" );
          LATIN1_ENTITIES.put( "eth",       "240" );
          LATIN1_ENTITIES.put( "euml",      "235" );
          LATIN1_ENTITIES.put( "iacute",    "237" );
          LATIN1_ENTITIES.put( "icirc",     "238" );
          LATIN1_ENTITIES.put( "igrave",    "236" );
          LATIN1_ENTITIES.put( "iuml",      "239" );
          LATIN1_ENTITIES.put( "ntilde",    "241" );
          LATIN1_ENTITIES.put( "oacute",    "243" );
          LATIN1_ENTITIES.put( "ocirc",     "244" );
          LATIN1_ENTITIES.put( "ograve",    "242" );
          LATIN1_ENTITIES.put( "oslash",    "248" );
          LATIN1_ENTITIES.put( "otilde",    "245" );
          LATIN1_ENTITIES.put( "ouml",      "246" );
          LATIN1_ENTITIES.put( "szlig",     "223" );
          LATIN1_ENTITIES.put( "thorn",     "254" );
          LATIN1_ENTITIES.put( "uacute",    "250" );
          LATIN1_ENTITIES.put( "ucirc",     "251" );
          LATIN1_ENTITIES.put( "ugrave",    "249" );
          LATIN1_ENTITIES.put( "uuml",      "252" );
          LATIN1_ENTITIES.put( "yacute",    "253" );
          LATIN1_ENTITIES.put( "yuml",      "255" );
  
          // symbols and punctuation: the rest of the HTML Latin-1 entity set
          // (160-191, plus the multiplication and division signs)
  
          LATIN1_ENTITIES.put( "nbsp",      "160" );
          LATIN1_ENTITIES.put( "iexcl",     "161" );
          LATIN1_ENTITIES.put( "cent",      "162" );
          LATIN1_ENTITIES.put( "pound",     "163" );
          LATIN1_ENTITIES.put( "curren",    "164" );
          LATIN1_ENTITIES.put( "yen",       "165" );
          LATIN1_ENTITIES.put( "brvbar",    "166" );
          LATIN1_ENTITIES.put( "sect",      "167" );
          LATIN1_ENTITIES.put( "uml",       "168" );
          LATIN1_ENTITIES.put( "copy",      "169" );
          LATIN1_ENTITIES.put( "ordf",      "170" );
          LATIN1_ENTITIES.put( "laquo",     "171" );
          LATIN1_ENTITIES.put( "not",       "172" );
          LATIN1_ENTITIES.put( "shy",       "173" );
          LATIN1_ENTITIES.put( "reg",       "174" );
          LATIN1_ENTITIES.put( "macr",      "175" );
          LATIN1_ENTITIES.put( "deg",       "176" );
          LATIN1_ENTITIES.put( "plusmn",    "177" );
          LATIN1_ENTITIES.put( "sup2",      "178" );
          LATIN1_ENTITIES.put( "sup3",      "179" );
          LATIN1_ENTITIES.put( "acute",     "180" );
          LATIN1_ENTITIES.put( "micro",     "181" );
          LATIN1_ENTITIES.put( "para",      "182" );
          LATIN1_ENTITIES.put( "middot",    "183" );
          LATIN1_ENTITIES.put( "cedil",     "184" );
          LATIN1_ENTITIES.put( "sup1",      "185" );
          LATIN1_ENTITIES.put( "ordm",      "186" );
          LATIN1_ENTITIES.put( "raquo",     "187" );
          LATIN1_ENTITIES.put( "frac14",    "188" );
          LATIN1_ENTITIES.put( "frac12",    "189" );
          LATIN1_ENTITIES.put( "frac34",    "190" );
          LATIN1_ENTITIES.put( "iquest",    "191" );
          LATIN1_ENTITIES.put( "times",     "215" );
          LATIN1_ENTITIES.put( "divide",    "247" );
  
      }
      
  }
  
  
  
  1.1                  jakarta-commons-sandbox/feedparser/tests/anchor/anchor1.html
  
  Index: anchor1.html
  ===================================================================
  
  <a href="http://peerfear.org">
  
  </a>
  
  
  1.1                  jakarta-commons-sandbox/feedparser/tests/anchor/anchor2.html
  
  Index: anchor2.html
  ===================================================================
  
  <a href=http://peerfear.org target=foo >
  
   </a>
  
  
  1.1                  jakarta-commons-sandbox/feedparser/tests/anchor/anchor3.html
  
  Index: anchor3.html
  ===================================================================
  
  <a href=http://peerfear.org>
  
   </a>
  
  
  1.1                  jakarta-commons-sandbox/feedparser/tests/anchor/anchor4.html
  
  Index: anchor4.html
  ===================================================================
  
  <a target=bar href=http://peerfear.org>
  
      this is the anchor body
  
  </a>
  
  

---------------------------------------------------------------------
To unsubscribe, e-mail: commons-dev-unsubscribe@jakarta.apache.org
For additional commands, e-mail: commons-dev-help@jakarta.apache.org


[prev in list] [next in list] [prev in thread] [next in thread] 

Configure | About | News | Add a list | Sponsored by KoreLogic