/usr/include/xapian-1.3/xapian/weight.h

/** @file weight.h
 * @brief Weighting scheme API.
 */
/* Copyright (C) 2007,2008,2009,2010,2011,2012,2015 Olly Betts
 * Copyright (C) 2009 Lemur Consulting Ltd
 * Copyright (C) 2013,2014 Aarsh Shah
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
 */

#ifndef XAPIAN_INCLUDED_WEIGHT_H
#define XAPIAN_INCLUDED_WEIGHT_H

#include <string>

#include <xapian/types.h>
#include <xapian/visibility.h>

namespace Xapian {

/** Abstract base class for weighting schemes. */
class XAPIAN_VISIBILITY_DEFAULT Weight {
  protected:
    /// Stats which the weighting scheme can use (see @a need_stat()).
    typedef enum {
	/// Number of documents in the collection.
	COLLECTION_SIZE = 1,
	/// Number of documents in the RSet.
	RSET_SIZE = 2,
	/// Average length of documents in the collection.
	AVERAGE_LENGTH = 4,
	/// How many documents the current term is in.
	TERMFREQ = 8,
	/// How many documents in the RSet the current term is in.
	RELTERMFREQ = 16,
	/// Sum of wqf for terms in the query.
	QUERY_LENGTH = 32,
	/// Within-query-frequency of the current term.
	WQF = 64,
	/// Within-document-frequency of the current term in the current document.
	WDF = 128,
	/// Length of the current document (sum wdf).
	DOC_LENGTH = 256,
	/// Lower bound on (non-zero) document lengths.
	DOC_LENGTH_MIN = 512,
	/// Upper bound on document lengths.
	DOC_LENGTH_MAX = 1024,
	/// Upper bound on wdf.
	WDF_MAX = 2048,
	/// Sum of wdf over the whole collection for the current term.
	COLLECTION_FREQ = 4096,
	/// Number of unique terms in the current document.
	UNIQUE_TERMS = 8192
    } stat_flags;

    /** Tell Xapian that your subclass will want a particular statistic.
     *
     *  Some of the statistics can be costly to fetch or calculate, so
     *  Xapian needs to know which are actually going to be used.  You
     *  should call need_stat() from your constructor for each such
     *  statistic.
     *
     * @param flag  The stat_flags value for a required statistic.
     */
    void need_stat(stat_flags flag) {
	stats_needed = stat_flags(stats_needed | flag);
    }

    /** Allow the subclass to perform any initialisation it needs to.
     *
     *  @param factor	  Any scaling factor (e.g. from OP_SCALE_WEIGHT).
     *			  If the Weight object is for the term-independent
     *			  weight supplied by get_sumextra()/get_maxextra(),
     *			  then init(0.0) is called (starting from Xapian
     *			  1.2.11 and 1.3.1 - earlier versions failed to
     *			  call init() for such Weight objects).
     */
    virtual void init(double factor) = 0;

  private:
    /// Don't allow assignment.
    void operator=(const Weight &);

    /// A bitmask of the statistics this weighting scheme needs.
    stat_flags stats_needed;

    /// The number of documents in the collection.
    Xapian::doccount collection_size_;

    /// The number of documents marked as relevant.
    Xapian::doccount rset_size_;

    /// The average length of a document in the collection.
    Xapian::doclength average_length_;

    /// The number of documents which this term indexes.
    Xapian::doccount termfreq_;

    // The collection frequency of the term.
    Xapian::termcount collectionfreq_;

    /// The number of relevant documents which this term indexes.
    Xapian::doccount reltermfreq_;

    /// The length of the query.
    Xapian::termcount query_length_;

    /// The within-query-frequency of this term.
    Xapian::termcount wqf_;

    /// A lower bound on the minimum length of any document in the database.
    Xapian::termcount doclength_lower_bound_;

    /// An upper bound on the maximum length of any document in the database.
    Xapian::termcount doclength_upper_bound_;

    /// An upper bound on the wdf of this term.
    Xapian::termcount wdf_upper_bound_;

  public:

    /// Default constructor, needed by subclass constructors.
    Weight() : stats_needed() { }

    /** Type of smoothing to use with the Language Model Weighting scheme.
     *
     *  Default is TWO_STAGE_SMOOTHING.
     */
    typedef enum {
	TWO_STAGE_SMOOTHING = 1,
	DIRICHLET_SMOOTHING = 2,
	ABSOLUTE_DISCOUNT_SMOOTHING = 3,
	JELINEK_MERCER_SMOOTHING = 4
    } type_smoothing;

    class Internal;

    /** Virtual destructor, because we have virtual methods. */
    virtual ~Weight();

    /** Clone this object.
     *
     *  This method allocates and returns a copy of the object it is called on.
     *
     *  If your subclass is called FooWeight and has parameters a and b, then
     *  you would implement FooWeight::clone() like so:
     *
     *  FooWeight * FooWeight::clone() const { return new FooWeight(a, b); }
     *
     *  Note that the returned object will be deallocated by Xapian after use
     *  with "delete".  If you want to handle the deletion in a special way
     *  (for example when wrapping the Xapian API for use from another
     *  language) then you can define a static <code>operator delete</code>
     *  method in your subclass as shown here:
     *  http://trac.xapian.org/ticket/554#comment:1
     */
    virtual Weight * clone() const = 0;

    /** Return the name of this weighting scheme.
     *
     *  This name is used by the remote backend.  It is passed along with the
     *  serialised parameters to the remote server so that it knows which class
     *  to create.
     *
     *  Return the full namespace-qualified name of your class here - if
     *  your class is called FooWeight, return "FooWeight" from this method
     *  (Xapian::BM25Weight returns "Xapian::BM25Weight" here).
     *
     *  If you don't want to support the remote backend, you can use the
     *  default implementation which simply returns an empty string.
     */
    virtual std::string name() const;

    /** Return this object's parameters serialised as a single string.
     *
     *  If you don't want to support the remote backend, you can use the
     *  default implementation which simply throws Xapian::UnimplementedError.
     */
    virtual std::string serialise() const;

    /** Unserialise parameters.
     *
     *  This method unserialises parameters serialised by the @a serialise()
     *  method and allocates and returns a new object initialised with them.
     *
     *  If you don't want to support the remote backend, you can use the
     *  default implementation which simply throws Xapian::UnimplementedError.
     *
     *  Note that the returned object will be deallocated by Xapian after use
     *  with "delete".  If you want to handle the deletion in a special way
     *  (for example when wrapping the Xapian API for use from another
     *  language) then you can define a static <code>operator delete</code>
     *  method in your subclass as shown here:
     *  http://trac.xapian.org/ticket/554#comment:1
     *
     *  @param serialised	A string containing the serialised parameters.
     */
    virtual Weight * unserialise(const std::string & serialised) const;

    /** Calculate the weight contribution for this object's term to a document.
     *
     *  The parameters give information about the document which may be used
     *  in the calculations:
     *
     *  @param wdf    The within document frequency of the term in the document.
     *  @param doclen The document's length (unnormalised).
     *  @param uniqterms	Number of unique terms in the document (used
     *				for absolute smoothing).
     */
    virtual double get_sumpart(Xapian::termcount wdf,
			       Xapian::termcount doclen,
			       Xapian::termcount uniqterms) const = 0;

    /** Return an upper bound on what get_sumpart() can return for any document.
     *
     *  This information is used by the matcher to perform various
     *  optimisations, so strive to make the bound as tight as possible.
     */
    virtual double get_maxpart() const = 0;

    /** Calculate the term-independent weight component for a document.
     *
     *  The parameter gives information about the document which may be used
     *  in the calculations:
     *
     *  @param doclen The document's length (unnormalised).
     *  @param uniqterms The number of unique terms in the document.
     */
    virtual double get_sumextra(Xapian::termcount doclen,
				Xapian::termcount uniqterms) const = 0;

    /** Return an upper bound on what get_sumextra() can return for any
     *  document.
     *
     *  This information is used by the matcher to perform various
     *  optimisations, so strive to make the bound as tight as possible.
     */
    virtual double get_maxextra() const = 0;

    /** @private @internal Initialise this object to calculate weights for term
     *  @a term.
     *
     *  @param stats	  Source of statistics.
     *  @param query_len_ Query length.
     *  @param term	  The term for the new object.
     *  @param wqf_	  The within-query-frequency of @a term.
     *  @param factor	  Any scaling factor (e.g. from OP_SCALE_WEIGHT).
     */
    void init_(const Internal & stats, Xapian::termcount query_len_,
	       const std::string & term, Xapian::termcount wqf_,
	       double factor);

    /** @private @internal Initialise this object to calculate weights for a
     *  synonym.
     *
     *  @param stats	   Source of statistics.
     *  @param query_len_  Query length.
     *  @param factor	   Any scaling factor (e.g. from OP_SCALE_WEIGHT).
     *  @param termfreq    The termfreq to use.
     *  @param reltermfreq The reltermfreq to use.
     *  @param collection_freq The collection frequency to use.
     */
    void init_(const Internal & stats, Xapian::termcount query_len_,
	       double factor, Xapian::doccount termfreq,
	       Xapian::doccount reltermfreq, Xapian::termcount collection_freq);

    /** @private @internal Initialise this object to calculate the extra weight
     *  component.
     *
     *  @param stats	  Source of statistics.
     *  @param query_len_ Query length.
     */
    void init_(const Internal & stats, Xapian::termcount query_len_);

    /** @private @internal Return true if the document length is needed.
     *
     *  If this method returns true, then the document length will be fetched
     *  and passed to @a get_sumpart().  Otherwise 0 may be passed for the
     *  document length.
     */
    bool get_sumpart_needs_doclength_() const {
	return stats_needed & DOC_LENGTH;
    }

    /** @private @internal Return true if the WDF is needed.
     *
     *  If this method returns true, then the WDF will be fetched and passed to
     *  @a get_sumpart().  Otherwise 0 may be passed for the wdf.
     */
    bool get_sumpart_needs_wdf_() const {
	return stats_needed & WDF;
    }

    /** @private @internal Return true if the number of unique terms is needed.
     *
     *  If this method returns true, then the number of unique terms will be
     *  fetched and passed to @a get_sumpart().  Otherwise 0 may be passed for
     *  the number of unique terms.
     */
    bool get_sumpart_needs_uniqueterms_() const {
	return stats_needed & UNIQUE_TERMS;
    }

  protected:
    /** Don't allow copying.
     *
     *  This would ideally be private, but that causes a compilation error
     *  with GCC 4.1 (which appears to be a bug).
     */
    Weight(const Weight &);

    /// The number of documents in the collection.
    Xapian::doccount get_collection_size() const { return collection_size_; }

    /// The number of documents marked as relevant.
    Xapian::doccount get_rset_size() const { return rset_size_; }

    /// The average length of a document in the collection.
    Xapian::doclength get_average_length() const { return average_length_; }

    /// The number of documents which this term indexes.
    Xapian::doccount get_termfreq() const { return termfreq_; }

    /// The number of relevant documents which this term indexes.
    Xapian::doccount get_reltermfreq() const { return reltermfreq_; }

    // The collection frequency of the term.
    Xapian::termcount get_collection_freq() const { return collectionfreq_; }

    /// The length of the query.
    Xapian::termcount get_query_length() const { return query_length_; }

    /// The within-query-frequency of this term.
    Xapian::termcount get_wqf() const { return wqf_; }

    /** An upper bound on the maximum length of any document in the database.
     *
     *  This should only be used by get_maxpart() and get_maxextra().
     */
    Xapian::termcount get_doclength_upper_bound() const {
	return doclength_upper_bound_;
    }

    /** A lower bound on the minimum length of any document in the database.
     *
     *  This bound does not include any zero-length documents.
     *
     *  This should only be used by get_maxpart() and get_maxextra().
     */
    Xapian::termcount get_doclength_lower_bound() const {
	return doclength_lower_bound_;
    }

    /** An upper bound on the wdf of this term.
     *
     *  This should only be used by get_maxpart() and get_maxextra().
     */
    Xapian::termcount get_wdf_upper_bound() const {
	return wdf_upper_bound_;
    }
};

/** Class implementing a "boolean" weighting scheme.
 *
 *  This weighting scheme gives all documents zero weight.
 */
class XAPIAN_VISIBILITY_DEFAULT BoolWeight : public Weight {
    BoolWeight * clone() const;

    void init(double factor);

  public:
    /** Construct a BoolWeight. */
    BoolWeight() { }

    std::string name() const;

    std::string serialise() const;
    BoolWeight * unserialise(const std::string & serialised) const;

    double get_sumpart(Xapian::termcount wdf,
		       Xapian::termcount doclen,
		       Xapian::termcount uniqterms) const;
    double get_maxpart() const;

    double get_sumextra(Xapian::termcount doclen,
			Xapian::termcount uniqterms) const;
    double get_maxextra() const;
};

/// Xapian::Weight subclass implementing the tf-idf weighting scheme.
class XAPIAN_VISIBILITY_DEFAULT TfIdfWeight : public Weight {
    /* Three character string indicating the normalizations for tf(wdf), idf and
       tfidf weight. */
    std::string normalizations;

    /// The factor to multiply with the weight.
    double factor;

    TfIdfWeight * clone() const;

    void init(double factor);

    /* When additional normalizations are implemented in the future, the additional statistics for them
       should be accessed by these functions. */
    double get_wdfn(Xapian::termcount wdf, char c) const;
    double get_idfn(Xapian::doccount termfreq, char c) const;
    double get_wtn(double wt, char c) const;

  public:
    /** Construct a TfIdfWeight
     *
     *  @param normalizations	A three character string indicating the
     *				normalizations to be used for the tf(wdf), idf
     *				and document weight.  (default: "ntn")
     *
     * The @a normalizations string works like so:
     *
     * @li The first character specifies the normalization for the wdf.  The
     *     following normalizations are currently supported:
     *
     *     @li 'n': None.      wdfn=wdf
     *     @li 'b': Boolean    wdfn=1 if term in document else wdfn=0
     *     @li 's': Square     wdfn=wdf*wdf
     *     @li 'l': Logarithmic wdfn=1+log<sub>e</sub>(wdf)
     *
     *     The Max-wdf and Augmented Max wdf normalizations haven't yet been
     *     implemented.
     *
     * @li The second character indicates the normalization for the idf.  The
     *     following normalizations are currently supported:
     *
     *     @li 'n': None   idfn=1
     *     @li 't': TfIdf  idfn=log(N/Termfreq) where N is the number of
     *         documents in collection and Termfreq is the number of documents
     *         which are indexed by the term t.
     *     @li 'p': Prob   idfn=log((N-Termfreq)/Termfreq)
     *
     * @li The third and the final character indicates the normalization for
     *     the document weight.  The following normalizations are currently
     *     supported:
     *
     *     @li 'n': None wtn=tfn*idfn
     *
     * Implementing support for more normalizations of each type would require
     * extending the backend to track more statistics.
     */
    explicit TfIdfWeight(const std::string &normalizations);

    TfIdfWeight()
    : normalizations("ntn")
    {
	need_stat(TERMFREQ);
	need_stat(WDF);
	need_stat(WDF_MAX);
	need_stat(COLLECTION_SIZE);
    }

    std::string name() const;

    std::string serialise() const;
    TfIdfWeight * unserialise(const std::string & serialised) const;

    double get_sumpart(Xapian::termcount wdf,
		       Xapian::termcount doclen,
		       Xapian::termcount uniqterm) const;
    double get_maxpart() const;

    double get_sumextra(Xapian::termcount doclen,
			Xapian::termcount uniqterms) const;
    double get_maxextra() const;
};


/// Xapian::Weight subclass implementing the BM25 probabilistic formula.
class XAPIAN_VISIBILITY_DEFAULT BM25Weight : public Weight {
    /// Factor to multiply the document length by.
    mutable Xapian::doclength len_factor;

    /// Factor combining all the document independent factors.
    mutable double termweight;

    /// The BM25 parameters.
    double param_k1, param_k2, param_k3, param_b;

    /// The minimum normalised document length value.
    Xapian::doclength param_min_normlen;

    BM25Weight * clone() const;

    void init(double factor);

  public:
    /** Construct a BM25Weight.
     *
     *  @param k1  A non-negative parameter controlling how influential
     *		   within-document-frequency (wdf) is.  k1=0 means that
     *		   wdf doesn't affect the weights.  The larger k1 is, the more
     *		   wdf influences the weights.  (default 1)
     *
     *  @param k2  A non-negative parameter which controls the strength of a
     *		   correction factor which depends upon query length and
     *		   normalised document length.  k2=0 disable this factor; larger
     *		   k2 makes it stronger.  (default 0)
     *
     *  @param k3  A non-negative parameter controlling how influential
     *		   within-query-frequency (wqf) is.  k3=0 means that wqf
     *		   doesn't affect the weights.  The larger k3 is, the more
     *		   wqf influences the weights.  (default 1)
     *
     *  @param b   A parameter between 0 and 1, controlling how strong the
     *		   document length normalisation of wdf is.  0 means no
     *		   normalisation; 1 means full normalisation.  (default 0.5)
     *
     *  @param min_normlen  A parameter specifying a minimum value for
     *		   normalised document length.  Normalised document length
     *		   values less than this will be clamped to this value, helping
     *		   to prevent very short documents getting large weights.
     *		   (default 0.5)
     */
    BM25Weight(double k1, double k2, double k3, double b, double min_normlen)
	: param_k1(k1), param_k2(k2), param_k3(k3), param_b(b),
	  param_min_normlen(min_normlen)
    {
	if (param_k1 < 0) param_k1 = 0;
	if (param_k2 < 0) param_k2 = 0;
	if (param_k3 < 0) param_k3 = 0;
	if (param_b < 0) {
	    param_b = 0;
	} else if (param_b > 1) {
	    param_b = 1;
	}
	need_stat(COLLECTION_SIZE);
	need_stat(RSET_SIZE);
	need_stat(TERMFREQ);
	need_stat(RELTERMFREQ);
	need_stat(WDF);
	need_stat(WDF_MAX);
	if (param_k2 != 0 || (param_k1 != 0 && param_b != 0)) {
	    need_stat(DOC_LENGTH_MIN);
	    need_stat(AVERAGE_LENGTH);
	}
	if (param_k1 != 0 && param_b != 0) need_stat(DOC_LENGTH);
	if (param_k2 != 0) need_stat(QUERY_LENGTH);
	if (param_k3 != 0) need_stat(WQF);
    }

    BM25Weight()
	: param_k1(1), param_k2(0), param_k3(1), param_b(0.5),
	  param_min_normlen(0.5)
    {
	need_stat(COLLECTION_SIZE);
	need_stat(RSET_SIZE);
	need_stat(TERMFREQ);
	need_stat(RELTERMFREQ);
	need_stat(WDF);
	need_stat(WDF_MAX);
	need_stat(DOC_LENGTH_MIN);
	need_stat(AVERAGE_LENGTH);
	need_stat(DOC_LENGTH);
	need_stat(WQF);
    }

    std::string name() const;

    std::string serialise() const;
    BM25Weight * unserialise(const std::string & serialised) const;

    double get_sumpart(Xapian::termcount wdf,
		       Xapian::termcount doclen,
		       Xapian::termcount uniqterm) const;
    double get_maxpart() const;

    double get_sumextra(Xapian::termcount doclen,
			Xapian::termcount uniqterms) const;
    double get_maxextra() const;
};

/** Xapian::Weight subclass implementing the traditional probabilistic formula.
 *
 * This class implements the "traditional" Probabilistic Weighting scheme, as
 * described by the early papers on Probabilistic Retrieval.  BM25 generally
 * gives better results.
 *
 * TradWeight(k) is equivalent to BM25Weight(k, 0, 0, 1, 0), except that
 * the latter returns weights (k+1) times larger.
 */
class XAPIAN_VISIBILITY_DEFAULT TradWeight : public Weight {
    /// Factor to multiply the document length by.
    mutable Xapian::doclength len_factor;

    /// Factor combining all the document independent factors.
    mutable double termweight;

    /// The parameter in the formula.
    double param_k;

    TradWeight * clone() const;

    void init(double factor);

  public:
    /** Construct a TradWeight.
     *
     *  @param k  A non-negative parameter controlling how influential
     *		  within-document-frequency (wdf) and document length are.
     *		  k=0 means that wdf and document length don't affect the
     *		  weights.  The larger k is, the more they do.  (default 1)
     */
    explicit TradWeight(double k = 1.0) : param_k(k) {
	if (param_k < 0) param_k = 0;
	if (param_k != 0.0) {
	    need_stat(AVERAGE_LENGTH);
	    need_stat(DOC_LENGTH);
	}
	need_stat(COLLECTION_SIZE);
	need_stat(RSET_SIZE);
	need_stat(TERMFREQ);
	need_stat(RELTERMFREQ);
	need_stat(DOC_LENGTH_MIN);
	need_stat(WDF);
	need_stat(WDF_MAX);
    }

    std::string name() const;

    std::string serialise() const;
    TradWeight * unserialise(const std::string & serialised) const;

    double get_sumpart(Xapian::termcount wdf,
		       Xapian::termcount doclen,
		       Xapian::termcount uniqueterms) const;
    double get_maxpart() const;

    double get_sumextra(Xapian::termcount doclen,
			Xapian::termcount uniqterms) const;
    double get_maxextra() const;
};

/** This class implements the InL2 weighting scheme.
 *
 *  InL2 is a representative scheme of the Divergence from Randomness Framework
 *  by Gianni Amati.
 *
 *  This weighting scheme is useful for tasks that require early precision.
 *
 *  It uses the Inverse document frequency model (In), the Laplace method to
 *  find the aftereffect of sampling (L) and the second wdf normalization
 *  proposed by Amati to normalize the wdf in the document to the length of the
 *  document (H2).
 *
 *  For more information about the DFR Framework and the InL2 scheme, please
 *  refer to: Gianni Amati and Cornelis Joost Van Rijsbergen Probabilistic
 *  models of information retrieval based on measuring the divergence from
 *  randomness ACM Transactions on Information Systems (TOIS) 20, (4), 2002,
 *  pp. 357-389.
 */
class XAPIAN_VISIBILITY_DEFAULT InL2Weight : public Weight {
    /// The wdf normalization parameter in the formula.
    double param_c;

    /// The upper bound on the weight a term can give to a document.
    double upper_bound;

    /// The constant values which are used on every call to get_sumpart().
    double wqf_product_idf;
    double c_product_avlen;

    InL2Weight * clone() const;

    void init(double factor);

  public:
    /** Construct an InL2Weight.
     *
     *  @param c  A non-negative and non zero parameter controlling the extent
     *		  of the normalization of the wdf to the document length. The
     *		  default value of 1 is suitable for longer queries but it may
     *		  need to be changed for shorter queries. For more information,
     *		  please refer to Gianni Amati's PHD thesis.
     */
    explicit InL2Weight(double c);

    InL2Weight()
    : param_c(1.0)
    {
	need_stat(AVERAGE_LENGTH);
	need_stat(DOC_LENGTH);
	need_stat(DOC_LENGTH_MIN);
	need_stat(DOC_LENGTH_MAX);
	need_stat(COLLECTION_SIZE);
	need_stat(WDF);
	need_stat(WDF_MAX);
	need_stat(WQF);
	need_stat(TERMFREQ);
    }

    std::string name() const;

    std::string serialise() const;
    InL2Weight * unserialise(const std::string & serialised) const;

    double get_sumpart(Xapian::termcount wdf,
		       Xapian::termcount doclen,
		       Xapian::termcount uniqterms) const;
    double get_maxpart() const;

    double get_sumextra(Xapian::termcount doclen,
			Xapian::termcount uniqterms) const;
    double get_maxextra() const;
};

/** This class implements the IfB2 weighting scheme.
 *
 *  IfB2 is a representative scheme of the Divergence from Randomness Framework
 *  by Gianni Amati.
 *
 *  It uses the Inverse term frequency model (If), the Bernoulli method to find
 *  the aftereffect of sampling (B) and the second wdf normalization proposed
 *  by Amati to normalize the wdf in the document to the length of the document
 *  (H2).
 *
 *  For more information about the DFR Framework and the IfB2 scheme, please
 *  refer to: Gianni Amati and Cornelis Joost Van Rijsbergen Probabilistic
 *  models of information retrieval based on measuring the divergence from
 *  randomness ACM Transactions on Information Systems (TOIS) 20, (4), 2002,
 *  pp. 357-389.
 */
class XAPIAN_VISIBILITY_DEFAULT IfB2Weight : public Weight {
    /// The wdf normalization parameter in the formula.
    double param_c;

    /// The upper bound on the weight.
    double upper_bound;

    /// The constant values which are used for calculations in get_sumpart().
    double wqf_product_idf;
    double c_product_avlen;
    double B_constant;

    IfB2Weight * clone() const;

    void init(double factor);

  public:
    /** Construct an IfB2Weight.
     *
     *  @param c  A non-negative and non zero parameter controlling the extent
     *		  of the normalization of the wdf to the document length. The
     *		  default value of 1 is suitable for longer queries but it may
     *		  need to be changed for shorter queries. For more information,
     *		  please refer to Gianni Amati's PHD thesis titled
     *		  Probabilistic Models for Information Retrieval based on
     *		  Divergence from Randomness.
     */
    explicit IfB2Weight(double c);

    IfB2Weight( ) : param_c(1.0) {
	need_stat(AVERAGE_LENGTH);
	need_stat(DOC_LENGTH);
	need_stat(DOC_LENGTH_MIN);
	need_stat(DOC_LENGTH_MAX);
	need_stat(COLLECTION_SIZE);
	need_stat(COLLECTION_FREQ);
	need_stat(WDF);
	need_stat(WDF_MAX);
	need_stat(WQF);
	need_stat(TERMFREQ);
    }

    std::string name() const;

    std::string serialise() const;
    IfB2Weight * unserialise(const std::string & serialised) const;

    double get_sumpart(Xapian::termcount wdf,
		       Xapian::termcount doclen,
		       Xapian::termcount uniqterm) const;
    double get_maxpart() const;

    double get_sumextra(Xapian::termcount doclen,
			Xapian::termcount uniqterms) const;
    double get_maxextra() const;
};

/** This class implements the IneB2 weighting scheme.
 *
 *  IneB2 is a representative scheme of the Divergence from Randomness
 *  Framework by Gianni Amati.
 *
 *  It uses the Inverse expected document frequency model (Ine), the Bernoulli
 *  method to find the aftereffect of sampling (B) and the second wdf
 *  normalization proposed by Amati to normalize the wdf in the document to the
 *  length of the document (H2).
 *
 *  For more information about the DFR Framework and the IneB2 scheme, please
 *  refer to: Gianni Amati and Cornelis Joost Van Rijsbergen Probabilistic
 *  models of information retrieval based on measuring the divergence from
 *  randomness ACM Transactions on Information Systems (TOIS) 20, (4), 2002,
 *  pp. 357-389.
 */
class XAPIAN_VISIBILITY_DEFAULT IneB2Weight : public Weight {
    /// The wdf normalization parameter in the formula.
    double param_c;

    /// The upper bound of the weight.
    double upper_bound;

    /// Constant values used in get_sumpart().
    double wqf_product_idf;
    double c_product_avlen;
    double B_constant;

    IneB2Weight * clone() const;

    void init(double factor);

  public:
    /** Construct an IneB2Weight.
     *
     *  @param c  A non-negative and non zero parameter controlling the extent
     *		  of the normalization of the wdf to the document length. The
     *		  default value of 1 is suitable for longer queries but it may
     *		  need to be changed for shorter queries. For more information,
     *		  please refer to Gianni Amati's PHD thesis.
     */
    explicit IneB2Weight(double c);

    IneB2Weight( ) : param_c(1.0) {
	need_stat(AVERAGE_LENGTH);
	need_stat(DOC_LENGTH);
	need_stat(DOC_LENGTH_MIN);
	need_stat(DOC_LENGTH_MAX);
	need_stat(COLLECTION_SIZE);
	need_stat(WDF);
	need_stat(WDF_MAX);
	need_stat(WQF);
	need_stat(COLLECTION_FREQ);
	need_stat(TERMFREQ);
    }

    std::string name() const;

    std::string serialise() const;
    IneB2Weight * unserialise(const std::string & serialised) const;

    double get_sumpart(Xapian::termcount wdf,
		       Xapian::termcount doclen,
		       Xapian::termcount uniqterms) const;
    double get_maxpart() const;

    double get_sumextra(Xapian::termcount doclen,
			Xapian::termcount uniqterms) const;
    double get_maxextra() const;
};

/** This class implements the BB2 weighting scheme.
 *
 *  BB2 is a representative scheme of the Divergence from Randomness Framework
 *  by Gianni Amati.
 *
 *  It uses the Bose-Einstein probabilistic distribution (B) along with
 *  Stirling's power approximation, the Bernoulli method to find the
 *  aftereffect of sampling (B) and the second wdf normalization proposed by
 *  Amati to normalize the wdf in the document to the length of the document
 *  (H2).
 *
 *  For more information about the DFR Framework and the BB2 scheme, please
 *  refer to : Gianni Amati and Cornelis Joost Van Rijsbergen Probabilistic
 *  models of information retrieval based on measuring the divergence from
 *  randomness ACM Transactions on Information Systems (TOIS) 20, (4), 2002,
 *  pp. 357-389.
 */
class XAPIAN_VISIBILITY_DEFAULT BB2Weight : public Weight {
    /// The wdf normalization parameter in the formula.
    double param_c;

    /// The upper bound on the weight.
    double upper_bound;

    /// The constant values to be used in get_sumpart().
    double c_product_avlen;
    double B_constant;
    double wt;
    double stirling_constant_1;
    double stirling_constant_2;

    BB2Weight * clone() const;

    void init(double factor);

  public:
    /** Construct a BB2Weight.
     *
     *  @param c  A non-negative and non zero parameter controlling the extent
     *		  of the normalization of the wdf to the document length. A
     *		  default value of 1 is suitable for longer queries but it may
     *		  need to be changed for shorter queries. For more information,
     *		  please refer to Gianni Amati's PHD thesis titled
     *		  Probabilistic Models for Information Retrieval based on
     *		  Divergence from Randomness.
     */
    explicit BB2Weight(double c);

    BB2Weight( ) : param_c(1.0) {
	need_stat(AVERAGE_LENGTH);
	need_stat(DOC_LENGTH);
	need_stat(DOC_LENGTH_MIN);
	need_stat(DOC_LENGTH_MAX);
	need_stat(COLLECTION_SIZE);
	need_stat(COLLECTION_FREQ);
	need_stat(WDF);
	need_stat(WDF_MAX);
	need_stat(WQF);
	need_stat(TERMFREQ);
    }

    std::string name() const;

    std::string serialise() const;
    BB2Weight * unserialise(const std::string & serialised) const;

    double get_sumpart(Xapian::termcount wdf,
		       Xapian::termcount doclen,
		       Xapian::termcount uniqterms) const;
    double get_maxpart() const;

    double get_sumextra(Xapian::termcount doclen,
			Xapian::termcount uniqterms) const;
    double get_maxextra() const;
};

/** This class implements the DLH weighting scheme, which is a representative
 *  scheme of the Divergence from Randomness Framework by Gianni Amati.
 *
 *  This is a parameter free weighting scheme and it should be used with query
 *  expansion to obtain better results. It uses the HyperGeometric Probabilistic
 *  model and Laplace's normalization to calculate the risk gain.
 *
 *  For more information about the DFR Framework and the DLH scheme, please
 *  refer to :
 *  a.) Gianni Amati and Cornelis Joost Van Rijsbergen Probabilistic
 *  models of information retrieval based on measuring the divergence from
 *  randomness ACM Transactions on Information Systems (TOIS) 20, (4), 2002, pp.
 *  357-389.
 *  b.) FUB, IASI-CNR and University of Tor Vergata at TREC 2007 Blog Track.
 *  G. Amati and E. Ambrosi and M. Bianchi and C. Gaibisso and G. Gambosi.
 *  Proceedings of the 16th Text REtrieval Conference (TREC-2007), 2008.
 */
class XAPIAN_VISIBILITY_DEFAULT DLHWeight : public Weight {
    /// The lower bound on the weight.
    double lower_bound;

    /// The upper bound on the weight.
    double upper_bound;

    /// The constant value to be used in get_sumpart().
    double log_constant;
    double wqf_product_factor;

    DLHWeight * clone() const;

    void init(double factor);

  public:
    DLHWeight() {
	need_stat(AVERAGE_LENGTH);
	need_stat(DOC_LENGTH);
	need_stat(COLLECTION_SIZE);
	need_stat(COLLECTION_FREQ);
	need_stat(WDF);
	need_stat(WQF);
	need_stat(WDF_MAX);
	need_stat(DOC_LENGTH_MIN);
	need_stat(DOC_LENGTH_MAX);
    }

    std::string name() const;

    std::string serialise() const;
    DLHWeight * unserialise(const std::string & serialised) const;

    double get_sumpart(Xapian::termcount wdf,
		       Xapian::termcount doclen,
		       Xapian::termcount uniqterms) const;
    double get_maxpart() const;

    double get_sumextra(Xapian::termcount doclen,
			Xapian::termcount uniqterms) const;
    double get_maxextra() const;
};

/** This class implements the PL2 weighting scheme.
 *
 *  PL2 is a representative scheme of the Divergence from Randomness Framework
 *  by Gianni Amati.
 *
 *  This weighting scheme is useful for tasks that require early precision.
 *
 *  It uses the Poisson approximation of the Binomial Probabilistic distribution
 *  (P) along with Stirling's approximation for the factorial value, the Laplace
 *  method to find the aftereffect of sampling (L) and the second wdf
 *  normalization proposed by Amati to normalize the wdf in the document to the
 *  length of the document (H2).
 *
 *  For more information about the DFR Framework and the PL2 scheme, please
 *  refer to : Gianni Amati and Cornelis Joost Van Rijsbergen Probabilistic models
 *  of information retrieval based on measuring the divergence from randomness
 *  ACM Transactions on Information Systems (TOIS) 20, (4), 2002, pp. 357-389.
 */
class XAPIAN_VISIBILITY_DEFAULT PL2Weight : public Weight {
    /// The wdf normalization parameter in the formula.
    double param_c;

    /// The lower bound of the weight.
    double lower_bound;

    /// The upper bound on the weight.
    double upper_bound;

    /// Constants for a given term in a given query.
    double P1, P2;

    /// Set by init() to (param_c * get_average_length())
    double cl;

    PL2Weight * clone() const;

    void init(double factor);

  public:
    /** Construct a PL2Weight.
     *
     *  @param c  A non-negative and non zero parameter controlling the extent
     *		  of the normalization of the wdf to the document length. The
     *		  default value of 1 is suitable for longer queries but it may
     *		  need to be changed for shorter queries. For more information,
     *		  please refer to Gianni Amati's PHD thesis titled
     *		  Probabilistic Models for Information Retrieval based on
     *		  Divergence from Randomness.
     */
    explicit PL2Weight(double c);

    PL2Weight( ) : param_c(1.0) {
	need_stat(AVERAGE_LENGTH);
	need_stat(DOC_LENGTH);
	need_stat(DOC_LENGTH_MIN);
	need_stat(DOC_LENGTH_MAX);
	need_stat(COLLECTION_SIZE);
	need_stat(COLLECTION_FREQ);
	need_stat(WDF);
	need_stat(WDF_MAX);
	need_stat(WQF);
    }

    std::string name() const;

    std::string serialise() const;
    PL2Weight * unserialise(const std::string & serialised) const;

    double get_sumpart(Xapian::termcount wdf,
		       Xapian::termcount doclen,
		       Xapian::termcount uniqterms) const;
    double get_maxpart() const;

    double get_sumextra(Xapian::termcount doclen,
			Xapian::termcount uniqterms) const;
    double get_maxextra() const;
};

/** This class implements the DPH weighting scheme.
 *
 *  DPH is a representative scheme of the Divergence from Randomness Framework
 *  by Gianni Amati.
 *
 *  This is a parameter free weighting scheme and it should be used with query
 *  expansion to obtain better results. It uses the HyperGeometric Probabilistic
 *  model and Popper's normalization to calculate the risk gain.
 *
 *  For more information about the DFR Framework and the DPH scheme, please
 *  refer to :
 *  a.) Gianni Amati and Cornelis Joost Van Rijsbergen
 *  Probabilistic models of information retrieval based on measuring the
 *  divergence from randomness ACM Transactions on Information Systems (TOIS) 20,
 *  (4), 2002, pp. 357-389.
 *  b.) FUB, IASI-CNR and University of Tor Vergata at TREC 2007 Blog Track.
 *  G. Amati and E. Ambrosi and M. Bianchi and C. Gaibisso and G. Gambosi.
 *  Proceedings of the 16th Text Retrieval Conference (TREC-2007), 2008.
 */
class XAPIAN_VISIBILITY_DEFAULT DPHWeight : public Weight {
    /// The upper bound on the weight.
    double upper_bound;

    /// The lower bound on the weight.
    double lower_bound;

    /// The constant value used in get_sumpart() .
    double log_constant;
    double wqf_product_factor;

    DPHWeight * clone() const;

    void init(double factor);

  public:
    /** Construct a DPHWeight. */
    DPHWeight() {
	need_stat(AVERAGE_LENGTH);
	need_stat(DOC_LENGTH);
	need_stat(COLLECTION_SIZE);
	need_stat(COLLECTION_FREQ);
	need_stat(WDF);
	need_stat(WQF);
	need_stat(WDF_MAX);
	need_stat(DOC_LENGTH_MIN);
	need_stat(DOC_LENGTH_MAX);
    }

    std::string name() const;

    std::string serialise() const;
    DPHWeight * unserialise(const std::string & serialised) const;

    double get_sumpart(Xapian::termcount wdf,
		       Xapian::termcount doclen,
		       Xapian::termcount uniqterms) const;
    double get_maxpart() const;

    double get_sumextra(Xapian::termcount doclen,
			Xapian::termcount uniqterms) const;
    double get_maxextra() const;
};


/** Xapian::Weight subclass implementing the Language Model formula.
 *
 * This class implements the "Language Model" Weighting scheme, as
 * described by the early papers on LM by Bruce Croft.
 *
 * LM works by comparing the query to a Language Model of the document.
 * The language model itself is parameter-free, though LMWeight takes
 * parameters which specify the smoothing used.
 */
class XAPIAN_VISIBILITY_DEFAULT LMWeight : public Weight {

    /** The type of smoothing to use. */
    type_smoothing select_smoothing;

    // Parameters for handling negative value of log, and for smoothing.
    double param_log, param_smoothing1, param_smoothing2;

    // Collection weight.
    double weight_collection;

    LMWeight * clone() const;

    void init(double factor);

  public:
    /** Construct a LMWeight.
     *
     *  @param param_log_	A non-negative parameter controlling how much
     *				to clamp negative values returned by the log.
     *				The log is calculated by multiplying the
     *				actual weight by param_log.  If param_log is
     *				0.0, then the document length upper bound will
     *				be used (default: document length upper	bound)
     *
     *  @param select_smoothing_	A parameter of type enum
     *					type_smoothing.  This parameter
     *					controls which smoothing type to use.
     *					(default: TWO_STAGE_SMOOTHING)
     *
     *  @param param_smoothing1_	A non-negative parameter for smoothing
     *					whose meaning depends on
     *					select_smoothing_.  In
     *					JELINEK_MERCER_SMOOTHING, it plays the
     *					role of estimation and in
     *					DIRICHLET_SMOOTHING the role of query
     *					modelling. (default JELINEK_MERCER,
     *					ABSOLUTE, TWOSTAGE(0.7),
     *					DIRCHLET(2000))
     *
     *  @param param_smoothing2_	A non-negative parameter which is used
     *					only with TWO_STAGE_SMOOTHING as
     *					parameter for Dirichlet's smoothing.
     *					(default: 2000)
     */
    // Unigram LM Constructor to specifically mention all parameters for handling negative log value and smoothing.
    explicit LMWeight(double param_log_ = 0.0,
		      type_smoothing select_smoothing_ = TWO_STAGE_SMOOTHING,
		      double param_smoothing1_ = 0.7,
		      double param_smoothing2_ = 2000.0)
	: select_smoothing(select_smoothing_), param_log(param_log_), param_smoothing1(param_smoothing1_),
	  param_smoothing2(param_smoothing2_)
    {
	need_stat(AVERAGE_LENGTH);
	need_stat(DOC_LENGTH);
	need_stat(COLLECTION_SIZE);
	need_stat(RSET_SIZE);
	need_stat(TERMFREQ);
	need_stat(RELTERMFREQ);
	need_stat(DOC_LENGTH_MAX);
	need_stat(WDF);
	need_stat(WDF_MAX);
	need_stat(COLLECTION_FREQ);
	if (select_smoothing == ABSOLUTE_DISCOUNT_SMOOTHING)
	    need_stat(UNIQUE_TERMS);
    }

    std::string name() const;

    std::string serialise() const;
    LMWeight * unserialise(const std::string & s) const;

    double get_sumpart(Xapian::termcount wdf,
		       Xapian::termcount doclen,
		       Xapian::termcount uniqterm) const;
    double get_maxpart() const;

    double get_sumextra(Xapian::termcount doclen, Xapian::termcount) const;
    double get_maxextra() const;
};

}

#endif // XAPIAN_INCLUDED_WEIGHT_H
libxapian-1.3-dev 1.3.4-0ubuntu6 / usr / include / xapian-1.3 / xapian / weight.h