Search Engine


Table of Contents
1. Documentation
2. Source Code
3. Sample Output


Introduction:
The program constructs an inverted index and ranks documents in a collection using the B25 algorithm (B25 method) or Query-likelihood coupled with Dirichlet smoothing (QL method).

How To Use:
1. Have tccorpus.txt in the same directory as the .java files. Run the program. It will start by constructing the inverted index.
2. The program will now ask for a mu value (used only for Dirchlet smoothing). Enter a mu value. The program will now as for queries in a loop, outputting the top 10 results. You may enter as many queries as you like to the program for this mu value. When you are done, just press “enter” and the program will terminate. All the output result from this session will be written to a file called “output.txt”. To enter a new mu value, restart the program.

Notes concerning mu:
“Small values of mu give more importance to the relative weighting of words and large values favor the number of matching terms” (Croft et al 259). Increasing mu too much puts too much importance on how many times a word appears in the collection and as a result, skews the weight of each word and cancel out the effect of the document length. As you can see from the denominator of the equation, which is doc_length + mu, if mu is very small, then the doc_length will have a large effect on the ranking but if mu is very large, then the doc_length is negligible.


Source Code

SearchEngine.java

  import java.io.*;
import java.util.*;
import java.util.Map.Entry;


public class SearchEngine {
  
  static Map<String,Term> map = new HashMap<String,Term>();
  static Map<String, Integer> docs = new HashMap<String, Integer>();
  //static Map<String, Double> b25 = new HashMap<String, Double>();
  static Map<String, Double> QL = new HashMap<String, Double>();
  
  public static boolean ASC = true;
    public static boolean DESC = false;
  
  static double k1=1.2;
  static double b=.75;
  static double k2=100;
  
  static void printMap(Map<String,Term> map){

    System.out.println("*****START*****");
    for (String name: map.keySet()){

      String key = name.toString();
      String value = map.get(name).toString();  
      System.out.println(key + " " + map.get(key).getTotOccurance());  


    }
    System.out.println("*****END****");
  }
  
  
  static int getTotWords(){
    int tot=0;
    for (String name: map.keySet()){
      String key = name.toString();
      tot+=map.get(key).getTotOccurance();
    }
    return tot;
  }
  
  static double QL(String query, String current, int mu){
    String [] q = query.split(" ");
    
    double fqid=0;
    double cqi=0;
    double lcl= getTotWords();
    double ldl=docs.get(current);
    
    double total=0;
    double log=0;
    
    for(int i=0; i<q.length; i++){
      if(map.get(q[i])!=null){
        fqid=(double)map.get(q[i]).getDocOccurances(current);
        cqi = map.get(q[i]).getTotOccurance();
      }
      else{
        fqid=0;
        cqi=0.000000000000000001;
      }
      double top=(double)fqid+(mu*(cqi/lcl));
      double bottom=ldl+mu;
      double divide=top/bottom;
      if(divide!=0)
        log=Math.log(divide);
      else{
        log=0;
      }
      total=total+log;
    }
    
    return total;
  }
  
  static double B25(String query, String current){
    String[] q=query.split(" ");
    
    double ranking=0;
    double totalvar=0;
    int f1,f2;
    double avdl = getTotWords()/docs.size();
    double dlavdl = docs.get(current)/avdl;
    
    double K = 1.2*(.25+.75*dlavdl);
    
    
    for (String word: q) {
      totalvar=0;
      
      if(map.get(word)!=null){
        f1 = map.get(word).getDocOccurances(current);
        f2 = map.get(word).docs.size();
      }
      else{
        f1=0;
        f2=0;
      }
  
      double var1 = 1/((f2+0.5)/(docs.size()-f2+0.5));
      var1 = Math.log(var1);
      double var2 = ((1.2+1)*f1)/(K+f1);
      double var3 = ((k2+1)*1)/(k2+1);
      
      if (var2==0)
        totalvar=var1*var3;
      else if (var3==0)
        totalvar=var1*var2;
      else
        totalvar=var1*var2*var3;
      
      ranking+=totalvar;
      //System.out.println(ranking);
      
    }
    return ranking;
  }
  
  private static Map<String, Double> sortByComparator(Map<String, Double> b252, final boolean order){
        List<Entry<String, Double>> list = new LinkedList<Entry<String, Double>>(b252.entrySet());
        // Sorting the list based on values
        Collections.sort(list, new Comparator<Entry<String, Double>>()
        {
            public int compare(Entry<String, Double> o1,
                    Entry<String, Double> o2)
            {
                if (order)
                    return o1.getValue().compareTo(o2.getValue());
                else
                    return o2.getValue().compareTo(o1.getValue());
            }
        });

        // Maintaining insertion order with the help of LinkedList
        Map<String, Double> sortedMap = new LinkedHashMap<String, Double>();
        for (Entry<String, Double> entry : list){
            sortedMap.put(entry.getKey(), entry.getValue());
        }

        return sortedMap;
    }

  public static void main(String[] args) throws Exception{

    BufferedReader read = new BufferedReader(new FileReader("tccorpus.txt"));
        
    String line = read.readLine();
    while(line!=null && line.startsWith("#")){
      String[] docID=line.split(" ");
      docs.put(docID[1],0);

      line=read.readLine();
      while(line!=null && !line.startsWith("#")){
        String[] tokens = line.split(" ");
        docs.put(docID[1],docs.get(docID[1])+tokens.length);
        
        for(String a : tokens){
          Term word = map.get(a);
          if (word==null)
            word = new Term(a);
          word.addOccurance(docID[1]);
          map.put(word.getTerm(), word);
        }
        line=read.readLine();
      }
    }
    
    //printMap(map); 
    System.out.println("Enter a value for mu");
    Scanner in = new Scanner(System.in);
    int mu = Integer.parseInt(in.nextLine());
    
    System.out.println("Enter a query");
    Scanner sc = new Scanner(System.in);
    String query = sc.nextLine();
    
    File f = new File("output.txt");
        FileWriter fr = new FileWriter(f);
        BufferedWriter br  = new BufferedWriter(fr);
        
        int count =1;
    while(query!=null && !query.equals("")){

      for (String name: docs.keySet()){
        String key = name.toString();
        double rank = QL(query, key, mu);
        QL.put(name,rank);
        //b25.put(name, rank);
        //System.out.println(key+ " " +rank);
      }
      
      Map<String, Double> sortedMapDesc = sortByComparator(QL, DESC);
          //Map<String, Double> sortedMapDesc = sortByComparator(b25, DESC);
  
          int counter=1;
          
          
         for(String key: sortedMapDesc.keySet()){
           String a =count+" Q0 " + key + " " +counter+ " " + sortedMapDesc.get(key) + " my_computer\n";
            System.out.println(a);
            br.write(a);
            if(counter==10)
              break;
            counter++;
          }
          
          System.out.println("\nEnter a query");
          sc = new Scanner(System.in);
      query = sc.nextLine();
      count++;
      
    }
    
    System.out.println("Done.");
    br.close();
    System.exit(0);
  
    
  }

}

Term.java

  import java.util.*;


public class Term {
  
  private String term;
  private int occurance;
  //Set<String> docIDs;
  Map<String, Integer> docs = new HashMap<String,Integer>();
  
  public Term(String t){
    term=t;
    occurance=0;
  }

  String getTerm(){
    return term;
  }
  
  int getTotOccurance(){
    return occurance;
  }
  
  void addOccurance(String id){
    occurance++;
    if(docs.get(id)==null)
      docs.put(id, 1);
    else
      docs.put(id, docs.get(id)+1);
    
  }
  
  int getDocOccurances(String id){
    
    if(docs.get(id)==null)
      return 0;
    else
      return docs.get(id);
  }
  

}

Sample Output

1 Q0 3127 1 -12.578099587442356 my_computer
1 Q0 2246 2 -15.788461051252689 my_computer
1 Q0 3196 3 -16.28569574178777 my_computer
1 Q0 2593 4 -18.294473362977353 my_computer
1 Q0 1930 5 -18.340578488773424 my_computer
1 Q0 1591 6 -18.865648666926806 my_computer
1 Q0 1680 7 -18.90495174983262 my_computer
1 Q0 1033 8 -19.16798111885309 my_computer
1 Q0 3068 9 -19.20360753586875 my_computer
1 Q0 1236 10 -19.33953387887789 my_computer
2 Q0 2748 1 -26.070768485277668 my_computer
2 Q0 2491 2 -27.544057511246972 my_computer
2 Q0 2701 3 -28.148745244953403 my_computer
2 Q0 3005 4 -28.308906328564717 my_computer
2 Q0 2530 5 -28.42444276401577 my_computer
2 Q0 1795 6 -28.44378303123148 my_computer
2 Q0 2897 7 -28.5080504171301 my_computer
2 Q0 2495 8 -28.559224154699248 my_computer
2 Q0 1886 9 -28.771695654481583 my_computer
2 Q0 2716 10 -28.786081861760046 my_computer
3 Q0 2714 1 -7.838047225791559 my_computer
3 Q0 2266 2 -8.174497005076066 my_computer
3 Q0 2973 3 -8.193928105190366 my_computer
3 Q0 950 4 -8.580145736407529 my_computer
3 Q0 3075 5 -8.633413829421835 my_computer
3 Q0 3156 6 -8.671054486189593 my_computer
3 Q0 2433 7 -8.697879664142242 my_computer
3 Q0 2289 8 -9.040347590909551 my_computer
3 Q0 2785 9 -9.102414920037692 my_computer
3 Q0 2401 10 -9.14414737053538 my_computer
4 Q0 2949 1 -74.35917123957027 my_computer
4 Q0 2406 2 -74.38171704936771 my_computer
4 Q0 2914 3 -74.43505502065167 my_computer
4 Q0 2250 4 -74.81305630648704 my_computer
4 Q0 2926 5 -75.09025223246199 my_computer
4 Q0 2276 6 -75.09655271594633 my_computer
4 Q0 2454 7 -75.13117858841757 my_computer
4 Q0 2969 8 -75.17034909591032 my_computer
4 Q0 241 9 -75.27339585073642 my_computer
4 Q0 2212 10 -75.38345431109488 my_computer
5 Q0 1194 1 -19.039846685073513 my_computer
5 Q0 1410 2 -19.178483670462647 my_computer
5 Q0 2535 3 -19.3417892901277 my_computer
5 Q0 268 4 -19.601453859764945 my_computer
5 Q0 1696 5 -19.62826767460359 my_computer
5 Q0 2065 6 -20.161575073177328 my_computer
5 Q0 1233 7 -20.375758514200175 my_computer
5 Q0 3120 8 -20.475841101919084 my_computer
5 Q0 20 9 -20.6097962311195 my_computer
5 Q0 1359 10 -20.712644864637667 my_computer
6 Q0 2318 1 -26.590201858150557 my_computer
6 Q0 3048 2 -28.084391793856152 my_computer
6 Q0 3070 3 -29.99680058834152 my_computer
6 Q0 2542 4 -31.37400583154529 my_computer
6 Q0 3089 5 -31.597154640671814 my_computer
6 Q0 2984 6 -31.641124764146582 my_computer
6 Q0 2894 7 -31.74476025927452 my_computer
6 Q0 2812 8 -32.14054364400156 my_computer
6 Q0 3119 9 -32.63212398411255 my_computer
6 Q0 2882 10 -32.761321866100765 my_computer
7 Q0 2967 1 -26.29051923253745 my_computer
7 Q0 1811 2 -27.837552400193392 my_computer
7 Q0 2973 3 -28.593592920112595 my_computer
7 Q0 2664 4 -28.6313140164122 my_computer
7 Q0 1457 5 -28.882069655494767 my_computer
7 Q0 2714 6 -28.966613771284944 my_computer
7 Q0 3156 7 -29.385172638823796 my_computer
7 Q0 2114 8 -29.41867641474627 my_computer
7 Q0 891 9 -29.473835157932925 my_computer
7 Q0 3075 10 -29.493120336426344 my_computer