Michael Falconer

the difficult takes time, the impossible just a little longer

My Links

Blog Stats

News

Michael Falconer is a freelance/contract .net developer working in and around Glasgow, Scotland, mainly on asp.net applications. His company is called, for some strange reason, Camel-Jones.

Archives

Post Categories

Blogs

Other Links

Regulars

Wednesday, November 10, 2004 #

.Net helper class for SQL Server Full Text Search

I've made use of SQL Servers ability to run a full text search in a couple of projects recently and wrote a little helper class that takes a typical search phrase and creates the tsql statement that you can then pass into a CONTAINSTABLE clause. Basically it looks either for words separated by a space, or a phrase with quotes around it, or a mixture of both.

There is one limitation in this implementation; the noise words are hard coded into a string array. It would be better to read in the contents of the noise word file that SQL Server uses, but in some shared hosting arrangements you may not have access to the file.

    1 using System;
    2 using System.Collections;
    3  
    4 namespace CamelJones.Data.Utils
    5 {
    6   /// <summary>
    7   /// Helper object for converting a search string into a
    8   /// Full Text Search query, used in a CONTAINSTABLE clause.
    9   /// </summary>
   10   public class SqlFtsHelper
   11   {
   12     #region Private Members
   13  
   14     private string _searchPhrase;
   15     private string _ftsQuery;
   16     private ArrayList _ignoredWords = new ArrayList();
   17     private string[] _noiseWords = new string[] {"about","after","all","also","an","and","another","any","are","as","at","be","because","been","before","being","between","both","but","by","came","can","come","could","did","do","each","for","from","get","got","has","had","he","have","her","here","him","himself","his","how","if","in","into","is","it","like","make","many","me","might","more","most","much","must","my","never","now","of","on","only","or","other","our","out","over","said","same","see","should","since","some","still","such","take","than","that","the","their","them","then","there","these","they","this","those","through","to","too","under","up","very","was","way","we","well","were","what","where","which","while","who","with","would","you","your"};
   18  
   19     #endregion
   20  
   21     #region Constructors
   22  
   23     /// <summary>
   24     /// Constructor for the SqlFtsHelper object
   25     /// </summary>
   26     /// <param name="query">The search string that is to be used to create the Full Text Search query</param>
   27     public SqlFtsHelper(string searchPhrase)
   28     {
   29       _searchPhrase = searchPhrase;
   30       BuildFTSQuery();
   31     }
   32  
   33     #endregion
   34  
   35     #region Public Methods
   36  
   37     /// <summary>
   38     /// The query string that was passed in to be used for the Full Text Search
   39     /// </summary>
   40     public string SearchPhrase
   41     {
   42       get { return _searchPhrase; }
   43     }
   44  
   45     /// <summary>
   46     /// The T-SQL statement to be used as the contains_search_condition parameter
   47     /// of a CONTAINSTABLE query
   48     /// </summary>
   49     public string FTSQuery
   50     {
   51       get { return _ftsQuery; }
   52     }
   53  
   54     /// <summary>
   55     /// An array of any noise words that were included in the search string
   56     /// </summary>
   57     public object[] IgnoredWords
   58     {
   59       get { return _ignoredWords.ToArray(); }
   60     }
   61  
   62     #endregion
   63  
   64     #region Private Methods
   65  
   66     /// <summary>
   67     /// Builds the Full Text Search query
   68     /// </summary>
   69     private void BuildFTSQuery()
   70     {
   71       _ftsQuery = "";
   72       string cleanQuery = _searchPhrase;
   73  
   74       // Strip out comments, wildcards
   75       cleanQuery = cleanQuery.Replace("--","");
   76       cleanQuery = cleanQuery.Replace("*","");
   77       cleanQuery = cleanQuery.Replace("%","");
   78  
   79       // Rather than simply split the string, walk
   80       // through it looking for phrases or individual
   81       // words
   82       cleanQuery = cleanQuery.Trim();
   83       while(cleanQuery.Length>0)
   84       {
   85         // If this is the only word
   86         if(cleanQuery.IndexOf('"')==-1 && cleanQuery.IndexOf(' ')==-1)
   87         {
   88           if(!CheckNoiseWord(cleanQuery))
   89           {
   90             FTSAddWord(cleanQuery);
   91           }
   92           break;
   93         }
   94  
   95         // Get the next word/phrase
   96         if(!cleanQuery.StartsWith("\""))
   97         {
   98           // Get the word from the search string
   99           int end = cleanQuery.IndexOf(" ");
  100           if(end<=0){end = cleanQuery.Length;}
  101           string theWord = cleanQuery.Substring(0,end);
  102  
  103           // Check it's not a noise word
  104           if(!CheckNoiseWord(theWord))
  105           {
  106             FTSAddWord(theWord);
  107           }
  108         
  109           // Remove the word from the search string
  110           cleanQuery = cleanQuery.Remove(0,end);
  111           cleanQuery = cleanQuery.Trim();
  112           continue;
  113         }
  114         else
  115         {
  116           // A phrase
  117           int end = cleanQuery.IndexOf('"',1) + 1;
  118           if(end<=1){end = cleanQuery.Length;}
  119           FTSAddPhrase(cleanQuery.Substring(0,end));
  120           cleanQuery = cleanQuery.Remove(0,end);
  121           cleanQuery = cleanQuery.Trim();
  122           continue;
  123         }
  124       }
  125     }
  126  
  127     /// <summary>
  128     /// Checks a string for noise words and, if any exist,
  129     /// removes them from the string and adds them to the
  130     /// array of ignored words.
  131     /// </summary>
  132     /// <param name="word">the word to check</param>
  133     /// <returns>true if the word is a noise word, otherwise false</returns>
  134     private bool CheckNoiseWord(string word)
  135     {
  136       if(IsNoiseWord(word))
  137       {
  138         _ignoredWords.Add(word);
  139         return true;
  140       }
  141       return false;
  142     }
  143  
  144     /// <summary>
  145     /// Add an individual word to the query
  146     /// </summary>
  147     /// <param name="word">the word to be added</param>
  148     private void FTSAddWord(string word)
  149     {
  150       if(_ftsQuery.Length>0)
  151       {
  152         _ftsQuery = _ftsQuery + " AND ";
  153       }
  154       _ftsQuery = _ftsQuery + "FORMSOF(INFLECTIONAL, " + word + ")";
  155     }
  156  
  157     /// <summary>
  158     /// Add a phrase to the FTS query
  159     /// </summary>
  160     /// <param name="phrase">the phrase to be added</param>
  161     private void FTSAddPhrase(string phrase)
  162     {
  163       if(_ftsQuery.Length>0)
  164       {
  165         _ftsQuery = _ftsQuery + " AND ";
  166       }
  167       _ftsQuery = _ftsQuery + phrase;
  168     }
  169  
  170     /// <summary>
  171     /// Checks if the word is a noise word
  172     /// </summary>
  173     /// <param name="word">the word to check</param>
  174     /// <returns>true if it is a noise word, otherwise false</returns>
  175     private bool IsNoiseWord(string word)
  176     {
  177       // Exclude individual characters
  178       if(word.Length>1)
  179       {
  180         // Iterate through the string array of noise words,
  181         // and check the word parameter does not match
  182         foreach(string item in _noiseWords)
  183         {
  184           if(word==item)
  185           {
  186             return true;
  187           }
  188         }
  189  
  190         return false;
  191       }
  192  
  193       return true;
  194     }
  195  
  196     #endregion
  197   }
  198 }

P.S. The code snippet above was formatted using the CopySourceAsHtml VS addin by Colin Coller.

posted @ 9:42 AM