package filerogue.server.ramdb; import java.util.ArrayList; import java.util.HashSet; import java.util.Hashtable; import java.util.Iterator; import java.util.logging.Level; import java.util.logging.Logger; import filerogue.*; import filerogue.server.DRM; /** * Title: FileRogue * Description: Global unrestricted file sharing * Copyright: Copyright (c) 2000 * Company: ITP Web Solutions * @author Brian Cairns * @version 1.0 */ public class TripletIndexer implements Indexer { private static final Logger logger = Logger.getLogger( "global" ); RAMDBFunctions ramdb; Hashtable index = new Hashtable( 26 * 26 * 26 ); public TripletIndexer( RAMDBFunctions ram ) { ramdb = ram; } public void addIndexes( CatalogData cat, Flag lifeLine, HashSet bannedExts ) { SubDirData dir; FileData file; HashSet sequenceSet = new HashSet(); String ext; int banCounter; int DRMCounter; for ( int d = 0; lifeLine.isTrue && d < cat.getTotalDirCount(); d++ ) { dir = cat.getSubDir( d ); /** @todo don't add empty directories */ createIndexFor( cat, dir.getName(), dir, sequenceSet ); banCounter = 0; DRMCounter = 0; for ( int f = 0; lifeLine.isTrue && f < dir.getFileCount(); f++ ) { if( ( file = dir.getFileData( f ) ) != null ) { ext = file.getExtension().toLowerCase(); /** @todo check here if file matches digest in MusicDataStore */ if ( bannedExts.contains( ext ) ) { // remove file from catalog dir.removeFileData( f ); banCounter++; } else if ( Version.ENABLE_DRM && !DRM.check( file ) ) { logger.log( Level.INFO, "DRM check failed for: user=" + cat.getOwner() + ", file=" + file ); dir.removeFileData( f ); DRMCounter++; } else { createIndexFor( cat, file.getName(), file, sequenceSet ); } } else logger.log( Level.SEVERE, "NULL file in TripletIndexer.addIndexes(): user=" + cat.getOwner() + ", file#=" + f + ", total#=" + dir.getFileCount() ); } if ( banCounter > 0 || DRMCounter > 0 ) { logger.log( Level.INFO, "Removed " + banCounter + " banned extensions from " + cat + ", dir: " + dir ); logger.log( Level.INFO, "Removed " + DRMCounter + " invalid checksums from " + cat + ", dir: " + dir ); dir.repack(); } } } // the old pre extension-blocking version // // public void addIndexes( CatalogData cat, Flag lifeLine ) // { // SubDirData dir; // FileData file; // HashSet sequenceSet = new HashSet(); // // for( int d = 0; lifeLine.isTrue && d < cat.getTotalDirCount(); d++ ) // { // dir = cat.getSubDir( d ); // createIndexFor( cat, dir.getName(), dir, sequenceSet ); // for( int f = 0; lifeLine.isTrue && f < dir.getFileCount(); f++ ) // { // file = dir.getFileData( f ); // createIndexFor( cat, file.getName(), file, sequenceSet ); // } // } // } public void removeIndexes( CatalogData cat, Flag lifeLine ) { // long time = System.currentTimeMillis(); Index tripletIndex = null; synchronized ( index ) { for ( Iterator iter = index.values().iterator(); lifeLine.isTrue && iter.hasNext(); ) { if ( ( tripletIndex = ( Index ) iter.next() ) != null ) { tripletIndex.removeEntries( cat.getOwner() ); if ( tripletIndex.getCount() == 0 ) { iter.remove(); } } } } // time = System.currentTimeMillis() - time; // logger.log( Level.FINE, "TripletIndexer.removeIndexes( " + cat.getOwner() + ", " + cat.getTotalFileCount() + " files ) took " + time + " ms" ); } public int getIndexSize( String triplet ) { Index i = ( Index ) index.get( triplet ); return i == null ? 0 : i.getCount(); } public int getTotalEntriesContaining( String term ) { if ( term.length() == 3 ) { return getIndexSize( term ); } int total = 0; String key = ""; synchronized ( index ) { for ( Iterator iter = index.keySet().iterator(); iter.hasNext(); ) { key = ( String ) iter.next(); if ( key.indexOf( term ) != -1 ) { total += ( ( Index ) index.get( key ) ).getCount(); } } } return total; } public SizedIterator getIterator( DBSearchRequest req ) { // check for single directory listing if ( req.getOwnerID() != -1 && req.getPathID() != -1 ) { return new SingleDirectoryIterator( ramdb, req ); } // ok, no single directory listing. now we try to figure out which is the best (smallest) index(es) to use MetaIndex extensionIndex = getExtensionMetaIndex( req ); MetaIndex tripletIndex = getSmallestTripletMetaIndex( req ); // System.out.println( "extensionIndex: " + ( extensionIndex == null ? "NULL" : String.valueOf( extensionIndex.getTotalCount() ) ) ); // System.out.println( "tripletIndex: " + ( tripletIndex == null ? "NULL" : String.valueOf( tripletIndex.getTotalCount() ) ) ); // decide which index to use MetaIndex finalIndex; if ( extensionIndex == null ) { if ( tripletIndex == null ) { logger.log( Level.FINE, "TripletIndexer.getIterator() - Using AllIterator" ); return new AllIterator( ramdb, req ); // if both are null, we have to search ALL files } else { logger.log( Level.FINE, "TripletIndexer.getIterator() - No extension data, using TripletIndex: " + tripletIndex ); finalIndex = tripletIndex; } } else { if ( tripletIndex == null ) { logger.log( Level.FINE, "TripletIndexer.getIterator() - No triplet data, using ExtensionIndex: " + extensionIndex ); finalIndex = extensionIndex; } else { logger.log( Level.FINE, "TripletIndexer.getIterator() - Using " + ( extensionIndex.getTotalCount() < tripletIndex.getTotalCount() ? "ExtensionIndex: " + extensionIndex : "TripletIndex: " + tripletIndex ) ); finalIndex = extensionIndex.getTotalCount() < tripletIndex.getTotalCount() ? extensionIndex : tripletIndex; } } if ( finalIndex.getTotalCount() == 0 ) { return new EmptyIterator(); } return new MetaIndexIterator( ramdb, req, finalIndex ); } private MetaIndex getSmallestTripletMetaIndex( DBSearchRequest req ) { MetaIndex meta = new MetaIndex(); // in order, look for triplets, pairs, single chars (if previous size is not found) ArrayList sequences = new ArrayList(); for ( byte c = 3; c >= 1 && sequences.size() == 0; c-- ) { sequences = getAllSequencesFrom( req, c ); // if sequences is still zero, there are no alphabetic characters in the query keywords } if ( sequences.size() > 0 ) { // find out which of these possible sequences yields the fewest results String rarestSequence = ""; int lowestCount = Integer.MAX_VALUE, currentCount; for ( int i = 0; i < sequences.size(); i++ ) { currentCount = getTotalEntriesContaining( ( String ) sequences.get( i ) ); if ( currentCount < lowestCount ) { rarestSequence = ( String ) sequences.get( i ); lowestCount = currentCount; } } // now add all possible indexes for this sequence to the meta index if ( lowestCount > 0 ) { buildMetaIndex( meta, rarestSequence ); } } else { return null; } return meta; } private void buildMetaIndex( MetaIndex target, String sequence ) { if ( sequence.length() == 3 ) { target.addIndex( ( Index ) index.get( sequence ) ); return; } else { String key = ""; synchronized ( index ) { for ( Iterator iter = index.keySet().iterator(); iter.hasNext(); ) { key = ( String ) iter.next(); if ( key.indexOf( sequence ) != -1 ) { target.addIndex( ( Index ) index.get( key ) ); } } } } } private void output( ArrayList list ) { if ( list == null ) { System.out.println( "NULL" ); } else { for ( int i = 0; i < list.size(); i++ ) { System.out.print( list.get( i ) + " " ); } System.out.println(); } } private MetaIndex getExtensionMetaIndex( DBSearchRequest req ) { String[] exts = req.getExts(); if ( exts != null ) { MetaIndex meta = new MetaIndex(); for ( int i = 0; i < exts.length; i++ ) { meta.addIndex( ramdb.getExtensionIndexer().getIndex( exts[ i ] ) ); } return meta; } return null; } private String getRarestTriplet( ArrayList triplets ) { String rarest = "", currentTriplet; int rareCount = Integer.MAX_VALUE, currentCount; for ( int i = 0; i < triplets.size(); i++ ) { currentTriplet = ( String ) triplets.get( i ); currentCount = getIndexSize( currentTriplet ); // System.out.println( "'" + currentTriplet + "', " + currentCount + " entries" ); if ( currentCount < rareCount ) { rarest = currentTriplet; rareCount = currentCount; } } // System.out.println( "Using '" + rarest + "'" ); return rarest; } private ArrayList getAllSequencesFrom( DBSearchRequest req, byte chars ) { ArrayList tripletList = new ArrayList(); buildSequencesList( req.getFileKeys(), tripletList, chars ); if ( !req.isPathLikeFile() ) { buildSequencesList( req.getPathKeys(), tripletList, chars ); } return tripletList; } private void buildSequencesList( String[] terms, ArrayList list, byte chars ) { byte sequentialLetters; if ( terms != null ) { for ( int t = 0; t < terms.length; t++ ) { if ( terms[ t ] != null && terms[ t ].length() > 0 && terms[ t ].charAt( 0 ) != '-' ) { sequentialLetters = 0; for ( int c = 0; c < terms[ t ].length(); c++ ) { if ( !isCharIndexable( terms[ t ].charAt( c ) ) ) { sequentialLetters = 0; } else if ( ++sequentialLetters >= chars ) { list.add( terms[ t ].substring( c - chars + 1, c + 1 ).toLowerCase() ); // if( !Character.isLetter( terms[ t ].charAt( c ) ) ) // sequentialLetters = 0; // else // if( ++sequentialLetters >= chars ) list.add( terms[ t ].substring( c - chars + 1, c + 1 ).toLowerCase() ); } } } } } } // function to determine if a character will be indexed private static boolean isCharIndexable( char c ) { return ( Character.isLetterOrDigit( c ) || c == '.' || c == '&' ); } /** * Creates index entries for all triplets in the term. * * uniqueSequenceList is used as a "scratch pad" to make sure we are not indexing a term more than once * the reason it is passed instead of created within is to avoid creating thousands of temporary HashSets for a single * catalog insertion, just create one (in addIndexes) and have this method use it over and over again */ private void createIndexFor( CatalogData cat, String term, FileList list, HashSet uniqueSequenceList ) { String triplet; byte sequentialLetters = 0; Index tripletIndex; ArrayList catalogListing; uniqueSequenceList.clear(); for ( int i = 0; i < term.length(); i++ ) { if ( isCharIndexable( term.charAt( i ) ) ) // if( Character.isLetter( term.charAt( i ) ) ) { sequentialLetters++; if ( sequentialLetters >= 3 ) { triplet = term.substring( i - 2, i + 1 ).toLowerCase(); if ( uniqueSequenceList.add( triplet ) ) // make sure we aren't indexing the same triplet more than once for this term { tripletIndex = ( Index ) index.get( triplet ); if ( tripletIndex == null ) { tripletIndex = new Index( triplet ); index.put( triplet, tripletIndex ); } tripletIndex.addEntry( cat.getOwner(), list ); } } } else { sequentialLetters = 0; } } } // call this routine with extreme caution... it produces a SHITLOAD of output. Should only be used with very small test catalogs public void dumpSummary() { String key = ""; Index i; int counter = 0; System.out.println( "*** INDEX SUMMARY ***************************************************************************" ); synchronized ( index ) { for ( Iterator iter = index.keySet().iterator(); iter.hasNext(); ) { key = ( String ) iter.next(); i = ( Index ) index.get( key ); if ( i != null ) { System.out.print( key + "(" + i.getCount() + ") " ); } if ( ++counter % 30 == 0 ) { System.out.println(); } } } System.out.println(); } }