How to use Microsoft Web N-gram service

From Data-gov Wiki

Jump to: navigation, search
Infobox (How-To) edit with form
  • name: How to use Microsoft Web N-gram service

  • description: this tutorial shows you how to use Microsoft Web N-gram service to build something interesting with example source code.
  • creator(s): Li Ding
  • created: April 30, 2010
  • modified: 2010-4-30


How to build a Multi-word TagCloud

1. visit their home page for more information,

  http://research.microsoft.com/web-ngram

2. Check out their beta test service. After agreeing to the terms of use, you should get a page titled "Microsoft Web N-Gram Service Quick Start".

  http://web-ngram.research.microsoft.com/info/

3. Install "Visual Studio Express" (I started with C# following their default step-by-step instructions. I will do more research on accessing their web service using other languages). It took me a while to get everything installed.

  http://www.microsoft.com/express/Windows/

4. Now follow their step-by-step instructions to build your first app. The instructions are great and helpful. I also made one small modification as a shortcut: I skipped the step "Modify the Project Configuration" and put the "userToken" and "ngramModel" parameters directly in the source code.

5. write the code!

using System;
using System.Collections.Generic;
using System.IO;
using System.Text.RegularExpressions;
using System.Linq;
using System.Text;

namespace NGramTest
{
    /// <summary>
    /// Console demo that greedily groups the words of a typed line into
    /// multi-word phrases, using log-probabilities returned by the N-gram
    /// lookup service to decide whether adjacent words belong together.
    /// </summary>
    class Program
    {
        static void Main(string[] args)
        {
            SegmentTextFromConsole();
        }

        /// <summary>
        /// Experiment 1: console-level text segmentation. Reads lines from the
        /// console, segments each one and prints the result, until an empty
        /// line is entered or the input stream ends.
        /// </summary>
        static void SegmentTextFromConsole()
        {
            Console.WriteLine("Please type a phrases");

            string line;
            // Console.ReadLine returns null once the input stream is exhausted
            // (e.g. redirected stdin), so treat null like an empty line and stop.
            while (!string.IsNullOrEmpty(line = Console.ReadLine()))
            {
                Program p = new Program();
                p.DoSegment(line);
                p.printSegmentResult();

                Console.WriteLine("Please type a phrases");
            }
        }

        ///////////////////////////////////////
        // local code

        // Credentials and model identifier passed to every service call.
        // Replace the token placeholder with your own user token.
        private string userToken = "<YOUR_USER_TOKEN>";
        private string ngramModel = "urn:ngram:bing-title:jun09:4";

        // Thresholds of the join heuristic (see ShouldJoin). All values are
        // compared against the log-probabilities returned by the service.
        private const float MaxProbabilityGap = 4;       // phrase and word must be within this distance
        private const float MinJoinGain = 1;             // joint must beat the independent estimate by more than this
        private const float StopWordProbability = -2;    // phrases above this are treated as stop words

        // Parallel arrays holding the segmentation result. They are sized to
        // the number of input words; slots past the last phrase keep a null
        // entry in result_phrases.
        private string[] result_phrases = null;
        private float[] result_phrases_probability = null;
        private bool[] result_phrases_ismulti = null;

        /// <summary>
        /// Prints every phrase produced by the last <see cref="DoSegment"/> call,
        /// together with its multi-word flag and log-probability. Prints only the
        /// header when no segmentation has been performed yet.
        /// </summary>
        public void printSegmentResult()
        {
            Console.WriteLine("-------Segmentation Result----------------");
            if (result_phrases == null)
            {
                // DoSegment has not run yet; there is nothing to list.
                return;
            }

            for (int i = 0; i < result_phrases.Length; i++)
            {
                // The first null slot marks the end of the used part of the arrays.
                if (null == result_phrases[i])
                {
                    break;
                }

                Console.Write("[phrase {0}]: ", i);
                Console.Write(result_phrases_ismulti[i]);
                Console.Write(" log(p)={0} \t", result_phrases_probability[i]);
                Console.Write(result_phrases[i]);
                Console.WriteLine();
            }
        }

        /// <summary>
        /// Splits <paramref name="line"/> into words and greedily merges each word
        /// into the phrase before it when the join heuristic accepts the pair.
        /// The outcome is stored in the result arrays and can be printed with
        /// <see cref="printSegmentResult"/>. Diagnostic output is written to the
        /// console for every word.
        /// </summary>
        /// <param name="line">Text to segment; null, empty or whitespace-only
        /// input yields an empty result without calling the service.</param>
        public void DoSegment(string line)
        {
            //remove surrounding white space (null is treated as empty input)
            line = (line ?? string.Empty).Trim();

            //pre-segmentation: drop empty tokens so that consecutive spaces do
            //not produce empty "words" that would be sent to the service
            string[] args = line.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries);
            result_phrases = new string[args.Length];
            result_phrases_probability = new float[args.Length];
            result_phrases_ismulti = new bool[args.Length];

            if (args.Length == 0)
            {
                return;
            }

            //look up token
            // NOTE(review): the client is never closed. If LookupServiceClient is a
            // generated WCF proxy it should be closed/aborted after use - confirm
            // against the service reference.
            NGramService.LookupServiceClient client = new NGramTest.NGramService.LookupServiceClient();

            // Index of the phrase currently being built (the last used slot).
            int index_result_phrases = 0;
            for (int i = 0; i < args.Length; i++)
            {
                Console.WriteLine("++++++");

                //show the phrase built so far (there is none before the first word)
                if (i > 0)
                {
                    Console.WriteLine("phrase A: " + result_phrases[index_result_phrases]);
                    Console.WriteLine("    log(p(A))= " + result_phrases_probability[index_result_phrases]);
                }

                //get probability of the current word
                float probability = client.GetProbability(userToken, ngramModel, args[i]);
                Console.WriteLine("phrase B: " + args[i]);
                Console.WriteLine("    log(p(B))= " + probability);

                if (i == 0)
                {
                    // the first word always starts the first phrase
                    result_phrases[index_result_phrases] = args[i];
                    result_phrases_probability[index_result_phrases] = probability;
                    result_phrases_ismulti[index_result_phrases] = false;
                    continue;
                }

                // join the previous phrase with the current word
                float previousProbability = result_phrases_probability[index_result_phrases];
                String jointPhrase = result_phrases[index_result_phrases] + " " + args[i];
                float jointp = client.GetProbability(userToken, ngramModel, jointPhrase);
                Console.WriteLine("phrase AB: " + jointPhrase);
                Console.WriteLine("    log(p(AB))= " + jointp);
                Console.WriteLine("    log(p(A)*p(B))= " + (previousProbability + probability));

                if (ShouldJoin(previousProbability, probability, jointp))
                {
                    //join: the current phrase grows by one word
                    result_phrases[index_result_phrases] = jointPhrase;
                    result_phrases_probability[index_result_phrases] = jointp;
                    result_phrases_ismulti[index_result_phrases] = true;
                }
                else
                {
                    //separate: the current word starts a new phrase
                    index_result_phrases++;

                    result_phrases[index_result_phrases] = args[i];
                    result_phrases_probability[index_result_phrases] = probability;
                    result_phrases_ismulti[index_result_phrases] = false;
                }
            }
        }

        /// <summary>
        /// Heuristic deciding whether a word should be appended to the phrase
        /// before it. All arguments are log-probabilities from the service.
        /// </summary>
        /// <param name="phraseProbability">log(p) of the phrase built so far (A).</param>
        /// <param name="wordProbability">log(p) of the current word (B).</param>
        /// <param name="jointProbability">log(p) of "A B" looked up as one phrase.</param>
        /// <returns>true when the word should be merged into the phrase.</returns>
        private static bool ShouldJoin(float phraseProbability, float wordProbability, float jointProbability)
        {
            // log(p(A)) + log(p(B)): the estimate if A and B were independent.
            float independentProbability = phraseProbability + wordProbability;

            // The joint phrase must be more likely than the independent estimate...
            bool canJoin = independentProbability < jointProbability;
            // ...by a clear margin...
            bool isJoinSignificantEnough = Math.Abs(independentProbability - jointProbability) > MinJoinGain;
            // ...the two parts must be of comparable likelihood...
            bool hasSignificantProbabilityDifference = Math.Abs(phraseProbability - wordProbability) > MaxProbabilityGap;
            // ...and the phrase so far must not look like a stop word. Only the
            // accumulated phrase is checked; the current word is not.
            bool isStopWord = phraseProbability > StopWordProbability;

            return canJoin && !hasSignificantProbabilityDifference && isJoinSignificantEnough && !isStopWord;
        }
    }
}
Facts about How to use Microsoft Web N-gram serviceRDF feed
Dcterms:created30 April 2010  +
Dcterms:creatorLi Ding  +
Dcterms:descriptionthis tutorial shows you how to use Microsoft Web N-gram service to build something interesting with example source code.
Dcterms:modified2010-4-30
Foaf:nameHow to use Microsoft Web N-gram service
Skos:altLabelHow to use Microsoft Web N-gram service  +, how to use microsoft web n-gram service  +, and HOW TO USE MICROSOFT WEB N-GRAM SERVICE  +
Personal tools
internal pages