import * as React from 'react'
  /* @jsx mdx */
import { mdx } from '@mdx-js/react';
/* @jsxRuntime classic */
/* @jsx mdx */
import DefaultLayout from "/home/runner/work/myedibleenso.github.io/myedibleenso.github.io/src/components/BasicLayout.js";
// Frontmatter for this page, exported as a named binding; it is empty here.
export const _frontmatter = {};

// Props spread onto the layout element on every render of MDXContent.
const layoutProps = { _frontmatter: _frontmatter };

// Layout component that wraps the rendered page content.
const MDXLayout = DefaultLayout;
export default function MDXContent({
  components,
  ...props
}) {
  return <MDXLayout {...layoutProps} {...props} components={components} mdxType="MDXLayout">


    <h1 {...{
      "id": "overview",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h1" {...{
        "href": "#overview",
        "aria-label": "overview permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Overview`}</h1>
    <p>{`In this tutorial, we're going to step through training a naïve Bayes classifier with `}<span parentName="p" {...{
        "className": "math math-inline"
      }}><span parentName="span" {...{
          "className": "katex"
        }}><span parentName="span" {...{
            "className": "katex-mathml"
          }}><math parentName="span" {...{
              "xmlns": "http://www.w3.org/1998/Math/MathML"
            }}><semantics parentName="math"><mrow parentName="semantics"><mi parentName="mrow">{`n`}</mi></mrow><annotation parentName="semantics" {...{
                  "encoding": "application/x-tex"
                }}>{`n`}</annotation></semantics></math></span><span parentName="span" {...{
            "className": "katex-html",
            "aria-hidden": "true"
          }}><span parentName="span" {...{
              "className": "base"
            }}><span parentName="span" {...{
                "className": "strut",
                "style": {
                  "height": "0.4306em"
                }
              }}></span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`n`}</span></span></span></span></span>{`-gram features using a popular machine learning library in Python called `}<a parentName="p" {...{
        "href": "https://scikit-learn.org/stable/index.html",
        "target": "_self",
        "rel": "nofollow"
      }}><code parentName="a" {...{
          "className": "language-text"
        }}>{`scikit-learn`}</code></a>{`.  This library provides implementations and an easy-to-use interface for many supervised and unsupervised`}<sup parentName="p" {...{
        "id": "fnref-1"
      }}><a parentName="sup" {...{
          "href": "#fn-1",
          "className": "footnote-ref"
        }}>{`1`}</a></sup>{` classifiers, as well as utilities for ...`}</p>
    <ul>
      <li parentName="ul">{`data preprocessing`}</li>
      <li parentName="ul">{`feature extraction`}</li>
      <li parentName="ul">{`feature selection`}</li>
      <li parentName="ul">{`dimensionality reduction`}</li>
      <li parentName="ul">{`classifier evaluation`}</li>
    </ul>
    <p>{`... among other things!`}</p>
    <h1 {...{
      "id": "outcomes",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h1" {...{
        "href": "#outcomes",
        "aria-label": "outcomes permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Outcomes`}</h1>
    <p>{`After completing this lesson, you'll be able to ...`}</p>
    <ul>
      <li parentName="ul">
        <p parentName="li">{`use the `}<a parentName="p" {...{
            "href": "https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html",
            "target": "_self",
            "rel": "nofollow"
          }}><code parentName="a" {...{
              "className": "language-text"
            }}>{`CountVectorizer`}</code></a>{` to generate `}<span parentName="p" {...{
            "className": "math math-inline"
          }}><span parentName="span" {...{
              "className": "katex"
            }}><span parentName="span" {...{
                "className": "katex-mathml"
              }}><math parentName="span" {...{
                  "xmlns": "http://www.w3.org/1998/Math/MathML"
                }}><semantics parentName="math"><mrow parentName="semantics"><mi parentName="mrow">{`n`}</mi></mrow><annotation parentName="semantics" {...{
                      "encoding": "application/x-tex"
                    }}>{`n`}</annotation></semantics></math></span><span parentName="span" {...{
                "className": "katex-html",
                "aria-hidden": "true"
              }}><span parentName="span" {...{
                  "className": "base"
                }}><span parentName="span" {...{
                    "className": "strut",
                    "style": {
                      "height": "0.4306em"
                    }
                  }}></span><span parentName="span" {...{
                    "className": "mord mathnormal"
                  }}>{`n`}</span></span></span></span></span>{`-gram counts for documents`}</p>
      </li>
      <li parentName="ul">
        <p parentName="li">{`use the `}<a parentName="p" {...{
            "href": "https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html",
            "target": "_self",
            "rel": "nofollow"
          }}><code parentName="a" {...{
              "className": "language-text"
            }}>{`LabelEncoder`}</code></a>{` to convert string representations of class labels to integers`}</p>
      </li>
      <li parentName="ul">
        <p parentName="li">{`train a `}<a parentName="p" {...{
            "href": "https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html",
            "target": "_self",
            "rel": "nofollow"
          }}>{`naïve Bayes classifier`}</a>{` and use it to make predictions`}</p>
      </li>
    </ul>
    <h1 {...{
      "id": "prerequisites",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h1" {...{
        "href": "#prerequisites",
        "aria-label": "prerequisites permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Prerequisites`}</h1>
    <p>{`Before starting this tutorial, you should be comfortable with ...`}</p>
    <ul>
      <li parentName="ul"><a parentName="li" {...{
          "href": "/tutorials/regex-tokenizer"
        }}>{`tokenization`}</a></li>
      <li parentName="ul"><a parentName="li" {...{
          "href": "/tutorials/n-grams"
        }}><span parentName="a" {...{
            "className": "math math-inline"
          }}><span parentName="span" {...{
              "className": "katex"
            }}><span parentName="span" {...{
                "className": "katex-mathml"
              }}><math parentName="span" {...{
                  "xmlns": "http://www.w3.org/1998/Math/MathML"
                }}><semantics parentName="math"><mrow parentName="semantics"><mi parentName="mrow">{`n`}</mi></mrow><annotation parentName="semantics" {...{
                      "encoding": "application/x-tex"
                    }}>{`n`}</annotation></semantics></math></span><span parentName="span" {...{
                "className": "katex-html",
                "aria-hidden": "true"
              }}><span parentName="span" {...{
                  "className": "base"
                }}><span parentName="span" {...{
                    "className": "strut",
                    "style": {
                      "height": "0.4306em"
                    }
                  }}></span><span parentName="span" {...{
                    "className": "mord mathnormal"
                  }}>{`n`}</span></span></span></span></span>{`-grams`}</a></li>
      <li parentName="ul"><a parentName="li" {...{
          "href": "/tutorials/text-normalization"
        }}>{`text normalization`}</a></li>
      <li parentName="ul"><a parentName="li" {...{
          "href": "/tutorials/docker-basics"
        }}>{`docker`}</a></li>
    </ul>
    <p>{`You can follow along and run these snippets interactively in the `}<a parentName="p" {...{
        "href": "https://ipython.readthedocs.io/en/stable/interactive/tutorial.html",
        "target": "_self",
        "rel": "nofollow"
      }}>{`IPython REPL`}</a>{` using the following docker image:`}</p>
    <div {...{
      "className": "gatsby-highlight",
      "data-language": "bash"
    }}><pre parentName="div" {...{
        "className": "language-bash"
      }}><code parentName="pre" {...{
          "className": "language-bash"
        }}><span parentName="code" {...{
            "className": "token function"
          }}>{`docker`}</span>{` run `}<span parentName="code" {...{
            "className": "token parameter variable"
          }}>{`-it`}</span>{` `}<span parentName="code" {...{
            "className": "token string"
          }}>{`"uazhlt/sklearn-demo:latest"`}</span>{` ipython`}</code></pre></div>
    <p><strong parentName="p">{`Alternatively`}</strong>{`, to launch a jupyter notebook server on port 7777, run the following command:`}</p>
    <div {...{
      "className": "gatsby-highlight",
      "data-language": "bash"
    }}><pre parentName="div" {...{
        "className": "language-bash"
      }}><code parentName="pre" {...{
          "className": "language-bash"
        }}><span parentName="code" {...{
            "className": "token function"
          }}>{`docker`}</span>{` run `}<span parentName="code" {...{
            "className": "token parameter variable"
          }}>{`-it`}</span>{` `}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`\\`}</span>{`
  `}<span parentName="code" {...{
            "className": "token parameter variable"
          }}>{`-p`}</span>{` `}<span parentName="code" {...{
            "className": "token number"
          }}>{`7777`}</span>{`:9999 `}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`\\`}</span>{`
  `}<span parentName="code" {...{
            "className": "token parameter variable"
          }}>{`-v`}</span>{` `}<span parentName="code" {...{
            "className": "token string"
          }}>{`"`}<span parentName="span" {...{
              "className": "token variable"
            }}>{`\${`}<span parentName="span" {...{
                "className": "token environment constant"
              }}>{`PWD`}</span>{`}`}</span>{`/notebooks:/app/notebooks"`}</span>{` `}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`\\`}</span>{`
  uazhlt/sklearn-demo:latest`}</code></pre></div>
    <p>{`First, let's import some things that we'll be using throughout this lesson...`}</p>
    <div {...{
      "className": "gatsby-highlight",
      "data-language": "python"
    }}><pre parentName="div" {...{
        "className": "language-python"
      }}><code parentName="pre" {...{
          "className": "language-python"
        }}><span parentName="code" {...{
            "className": "token keyword"
          }}>{`from`}</span>{` sklearn`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`feature_extraction`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`text `}<span parentName="code" {...{
            "className": "token keyword"
          }}>{`import`}</span>{` CountVectorizer
`}<span parentName="code" {...{
            "className": "token keyword"
          }}>{`from`}</span>{` sklearn`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`preprocessing `}<span parentName="code" {...{
            "className": "token keyword"
          }}>{`import`}</span>{` LabelEncoder
`}<span parentName="code" {...{
            "className": "token keyword"
          }}>{`from`}</span>{` sklearn`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`naive_bayes `}<span parentName="code" {...{
            "className": "token keyword"
          }}>{`import`}</span>{` MultinomialNB
`}<span parentName="code" {...{
            "className": "token keyword"
          }}>{`import`}</span>{` typing
`}<span parentName="code" {...{
            "className": "token keyword"
          }}>{`import`}</span>{` re`}</code></pre></div>
    <h2 {...{
      "id": "getting-started",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h2" {...{
        "href": "#getting-started",
        "aria-label": "getting started permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Getting started`}</h2>
    <p>{`We're going to step through training a naïve Bayes classifier using a toy dataset with `}<span parentName="p" {...{
        "className": "math math-inline"
      }}><span parentName="span" {...{
          "className": "katex"
        }}><span parentName="span" {...{
            "className": "katex-mathml"
          }}><math parentName="span" {...{
              "xmlns": "http://www.w3.org/1998/Math/MathML"
            }}><semantics parentName="math"><mrow parentName="semantics"><mi parentName="mrow">{`n`}</mi></mrow><annotation parentName="semantics" {...{
                  "encoding": "application/x-tex"
                }}>{`n`}</annotation></semantics></math></span><span parentName="span" {...{
            "className": "katex-html",
            "aria-hidden": "true"
          }}><span parentName="span" {...{
              "className": "base"
            }}><span parentName="span" {...{
                "className": "strut",
                "style": {
                  "height": "0.4306em"
                }
              }}></span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`n`}</span></span></span></span></span>{`-gram features.`}</p>
    <p>{`First, we'll need some documents ...`}</p>
    <div {...{
      "className": "gatsby-highlight",
      "data-language": "python"
    }}><pre parentName="div" {...{
        "className": "language-python"
      }}><code parentName="pre" {...{
          "className": "language-python"
        }}>{`docs_train `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{` `}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`[`}</span>{`
  `}<span parentName="code" {...{
            "className": "token string"
          }}>{`"LOL. I love this site http://scam.com/enter/to/win."`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{`
  `}<span parentName="code" {...{
            "className": "token string"
          }}>{`"My mother loves the lotto"`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{`
  `}<span parentName="code" {...{
            "className": "token string"
          }}>{`"Dear Sir or Madam, you have been named in a will to receive a sum of $2000000"`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{`
  `}<span parentName="code" {...{
            "className": "token string"
          }}>{`"Will you please stop using that site"`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{`
  `}<span parentName="code" {...{
            "className": "token string"
          }}>{`"hot and lonely tortoises in your neighborhood XXX"`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{`
  `}<span parentName="code" {...{
            "className": "token string"
          }}>{`"It's lonely being an outlier"`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{`
  `}<span parentName="code" {...{
            "className": "token string"
          }}>{`"Announcing the WaffleCOIN ICO"`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{`
  `}<span parentName="code" {...{
            "className": "token string"
          }}>{`"Important message from the IRS.  Send your bank account number and SSN immediately to irs-staff@scam.com"`}</span>{`
`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`]`}</span>{`

docs_test `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{` `}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`[`}</span>{`
  `}<span parentName="code" {...{
            "className": "token string"
          }}>{`"OMG. Have you seen this site http://scam.com/p0wnd"`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{`
  `}<span parentName="code" {...{
            "className": "token string"
          }}>{`"Dear Sir or Madam, I am contacting you about an urgent financial matter. I have taken possession of an abandoned account with a large sum of money ($1,000,000)"`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{`
  `}<span parentName="code" {...{
            "className": "token string"
          }}>{`"Did you remember to buy lettuce for the tortoise?"`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{`
  `}<span parentName="code" {...{
            "className": "token string"
          }}>{`"UPCOMING ICO! Get ready for MuffinCOIN"`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{`
`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`]`}</span></code></pre></div>
    <p>{`... and some labels for those documents`}</p>
    <div {...{
      "className": "gatsby-highlight",
      "data-language": "python"
    }}><pre parentName="div" {...{
        "className": "language-python"
      }}><code parentName="pre" {...{
          "className": "language-python"
        }}>{`y_train `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{` `}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`[`}</span>{`
  `}<span parentName="code" {...{
            "className": "token string"
          }}>{`"SPAM"`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{`
  `}<span parentName="code" {...{
            "className": "token string"
          }}>{`"HAM"`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{`
  `}<span parentName="code" {...{
            "className": "token string"
          }}>{`"SPAM"`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{`
  `}<span parentName="code" {...{
            "className": "token string"
          }}>{`"HAM"`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{`
  `}<span parentName="code" {...{
            "className": "token string"
          }}>{`"SPAM"`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{`
  `}<span parentName="code" {...{
            "className": "token string"
          }}>{`"HAM"`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{`
  `}<span parentName="code" {...{
            "className": "token string"
          }}>{`"SPAM"`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{`
  `}<span parentName="code" {...{
            "className": "token string"
          }}>{`"SPAM"`}</span>{`
`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`]`}</span>{`

y_test `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{` `}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`[`}</span>{`
  `}<span parentName="code" {...{
            "className": "token string"
          }}>{`"SPAM"`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{`
  `}<span parentName="code" {...{
            "className": "token string"
          }}>{`"SPAM"`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{`
  `}<span parentName="code" {...{
            "className": "token string"
          }}>{`"HAM"`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{`
  `}<span parentName="code" {...{
            "className": "token string"
          }}>{`"SPAM"`}</span>{`
`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`]`}</span></code></pre></div>
    <h2 {...{
      "id": "generating-and-counting-n-ngrams",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h2" {...{
        "href": "#generating-and-counting-n-ngrams",
        "aria-label": "generating and counting n ngrams permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Generating and counting `}<span parentName="h2" {...{
        "className": "math math-inline"
      }}><span parentName="span" {...{
          "className": "katex"
        }}><span parentName="span" {...{
            "className": "katex-mathml"
          }}><math parentName="span" {...{
              "xmlns": "http://www.w3.org/1998/Math/MathML"
            }}><semantics parentName="math"><mrow parentName="semantics"><mi parentName="mrow">{`n`}</mi></mrow><annotation parentName="semantics" {...{
                  "encoding": "application/x-tex"
                }}>{`n`}</annotation></semantics></math></span><span parentName="span" {...{
            "className": "katex-html",
            "aria-hidden": "true"
          }}><span parentName="span" {...{
              "className": "base"
            }}><span parentName="span" {...{
                "className": "strut",
                "style": {
                  "height": "0.4306em"
                }
              }}></span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`n`}</span></span></span></span></span>{`-grams`}</h2>
    <p>{`Now that we have a set of documents reserved for training purposes, let's look at how we can use `}<code parentName="p" {...{
        "className": "language-text"
      }}>{`scikit-learn`}</code>{`'s `}<code parentName="p" {...{
        "className": "language-text"
      }}>{`CountVectorizer`}</code>{` class to generate feature vectors of word or character `}<span parentName="p" {...{
        "className": "math math-inline"
      }}><span parentName="span" {...{
          "className": "katex"
        }}><span parentName="span" {...{
            "className": "katex-mathml"
          }}><math parentName="span" {...{
              "xmlns": "http://www.w3.org/1998/Math/MathML"
            }}><semantics parentName="math"><mrow parentName="semantics"><mi parentName="mrow">{`n`}</mi></mrow><annotation parentName="semantics" {...{
                  "encoding": "application/x-tex"
                }}>{`n`}</annotation></semantics></math></span><span parentName="span" {...{
            "className": "katex-html",
            "aria-hidden": "true"
          }}><span parentName="span" {...{
              "className": "base"
            }}><span parentName="span" {...{
                "className": "strut",
                "style": {
                  "height": "0.4306em"
                }
              }}></span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`n`}</span></span></span></span></span>{`-gram counts.`}</p>
    <p>{`First, we need to create an instance of the `}<code parentName="p" {...{
        "className": "language-text"
      }}>{`CountVectorizer`}</code>{` class that is configured for our use case.`}</p>
    <p>{`Let's look at a subset of parameters we can use to customize the behavior of `}<code parentName="p" {...{
        "className": "language-text"
      }}>{`CountVectorizer`}</code>{` ...`}</p>
    <div {...{
      "className": "gatsby-highlight",
      "data-language": "python"
    }}><pre parentName="div" {...{
        "className": "language-python"
      }}><code parentName="pre" {...{
          "className": "language-python"
        }}>{`vectorizer `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{` CountVectorizer`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`
  `}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# case fold all text `}</span>{`
  `}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# before generating n-grams`}</span>{`
  lowercase`}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span><span parentName="code" {...{
            "className": "token boolean"
          }}>{`True`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{`
  `}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# optionally apply the specified function`}</span>{`
  `}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# before counting n-grams`}</span>{`
  preprocessor`}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span><span parentName="code" {...{
            "className": "token boolean"
          }}>{`None`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{`
  `}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# optionally provide a list of tokens to remove/ignore before generating n-grams`}</span>{`
  stop_words`}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span><span parentName="code" {...{
            "className": "token boolean"
          }}>{`None`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{`
  `}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# specify a range of n-grams as (min_n, max_n). `}</span>{`
  `}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# (1, 1) means unigrams.`}</span>{`
  `}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# (1, 2) means unigrams and bigrams`}</span>{`
  `}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# (4, 5) means 4-grams and 5-grams`}</span>{`
  ngram_range`}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span><span parentName="code" {...{
            "className": "token number"
          }}>{`1`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{` `}<span parentName="code" {...{
            "className": "token number"
          }}>{`1`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{`
  `}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# "word", "char" (character), or "char_wb" n-grams`}</span>{`
  analyzer`}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span><span parentName="code" {...{
            "className": "token string"
          }}>{`"word"`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{`
  `}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# whether or not to use binary counts`}</span>{`
  binary`}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span><span parentName="code" {...{
            "className": "token boolean"
          }}>{`False`}</span>{`
`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span></code></pre></div>
    <p>{`The IPython REPL makes it easy to review the documentation for a class or function by typing the class or function name followed by `}<code parentName="p" {...{
        "className": "language-text"
      }}>{`??`}</code>{`.  To learn more about how to configure and use `}<code parentName="p" {...{
        "className": "language-text"
      }}>{`CountVectorizer`}</code>{`, type `}<code parentName="p" {...{
        "className": "language-text"
      }}>{`CountVectorizer??`}</code>{` in the IPython terminal to view its docstring.  Use the arrow keys to scroll forward and backward through the documentation.  Type `}<code parentName="p" {...{
        "className": "language-text"
      }}>{`q`}</code>{` to quit or exit the documentation screen.`}</p>
    <p>{`Now that we've created an instance of `}<code parentName="p" {...{
        "className": "language-text"
      }}>{`CountVectorizer`}</code>{`, let's see how it's used to turn text into count-based feature vectors. `}</p>
    <h2 {...{
      "id": "determine-the-feature-vocabulary",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h2" {...{
        "href": "#determine-the-feature-vocabulary",
        "aria-label": "determine the feature vocabulary permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Determine the feature vocabulary`}</h2>
    <p>{`We can determine our feature vocabulary using the `}<code parentName="p" {...{
        "className": "language-text"
      }}>{`.fit()`}</code>{` method.  In other words, we'll provide a set of documents (our training data) to come up with a set of `}<span parentName="p" {...{
        "className": "math math-inline"
      }}><span parentName="span" {...{
          "className": "katex"
        }}><span parentName="span" {...{
            "className": "katex-mathml"
          }}><math parentName="span" {...{
              "xmlns": "http://www.w3.org/1998/Math/MathML"
            }}><semantics parentName="math"><mrow parentName="semantics"><mi parentName="mrow">{`n`}</mi></mrow><annotation parentName="semantics" {...{
                  "encoding": "application/x-tex"
                }}>{`n`}</annotation></semantics></math></span><span parentName="span" {...{
            "className": "katex-html",
            "aria-hidden": "true"
          }}><span parentName="span" {...{
              "className": "base"
            }}><span parentName="span" {...{
                "className": "strut",
                "style": {
                  "height": "0.4306em"
                }
              }}></span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`n`}</span></span></span></span></span>{`-gram features (the columns in our feature vectors).  This will allow us to transform unseen data to a familiar form that we can feed into a classifier or use to compute similarity to other text data we transform using the same process (we must maintain the column count and order!).`}</p>
    <p>{`Let's use the toy training data we defined earlier to derive our feature vocabulary:`}</p>
    <div {...{
      "className": "gatsby-highlight",
      "data-language": "python"
    }}><pre parentName="div" {...{
        "className": "language-python"
      }}><code parentName="pre" {...{
          "className": "language-python"
        }}>{`vectorizer`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`fit`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`docs_train`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span></code></pre></div>
    <p><code parentName="p" {...{
        "className": "language-text"
      }}>{`.fit()`}</code>{` assigns each unique feature an ID that corresponds to its position in any feature vector we later generate using the `}<code parentName="p" {...{
        "className": "language-text"
      }}>{`.transform()`}</code>{` method of our vectorizer.`}</p>
    <p>{`Once we've fit our `}<code parentName="p" {...{
        "className": "language-text"
      }}>{`CountVectorizer`}</code>{`, we can access a vocabulary mapping (item `}<span parentName="p" {...{
        "className": "math math-inline"
      }}><span parentName="span" {...{
          "className": "katex"
        }}><span parentName="span" {...{
            "className": "katex-mathml"
          }}><math parentName="span" {...{
              "xmlns": "http://www.w3.org/1998/Math/MathML"
            }}><semantics parentName="math"><mrow parentName="semantics"><mo parentName="mrow">{`→`}</mo></mrow><annotation parentName="semantics" {...{
                  "encoding": "application/x-tex"
                }}>{`\\rightarrow`}</annotation></semantics></math></span><span parentName="span" {...{
            "className": "katex-html",
            "aria-hidden": "true"
          }}><span parentName="span" {...{
              "className": "base"
            }}><span parentName="span" {...{
                "className": "strut",
                "style": {
                  "height": "0.3669em"
                }
              }}></span><span parentName="span" {...{
                "className": "mrel"
              }}>{`→`}</span></span></span></span></span>{` index):`}</p>
    <div {...{
      "className": "gatsby-highlight",
      "data-language": "python"
    }}><pre parentName="div" {...{
        "className": "language-python"
      }}><code parentName="pre" {...{
          "className": "language-python"
        }}>{`vectorizer`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`vocabulary_`}</code></pre></div>
    <p>{`These (key, value) mappings tell us the column index (value) of each feature (key) in every feature vector we generate using this vectorizer.`}</p>
    <p>{`Now that we've determined our vocabulary, let's apply our vectorizer to some data using `}<code parentName="p" {...{
        "className": "language-text"
      }}>{`.transform()`}</code>{`.`}</p>
    <div {...{
      "className": "gatsby-highlight",
      "data-language": "python"
    }}><pre parentName="div" {...{
        "className": "language-python"
      }}><code parentName="pre" {...{
          "className": "language-python"
        }}>{`vectorizer`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`transform`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`
    `}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`[`}</span>{`
        `}<span parentName="code" {...{
            "className": "token string"
          }}>{`"It's going to cost you $23,030.12 or more."`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{` 
        `}<span parentName="code" {...{
            "className": "token string"
          }}>{`"Pay here: http://super-sketchy-site.info"`}</span>{`
    `}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`]`}</span>{`
`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`todense`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span></code></pre></div>
    <p>{`Notice that the matrix that is returned is very `}<strong parentName="p">{`sparse`}</strong>{`.  Most of the values are zero.  Only a few are non-zero.  For a very large feature space, this is an inefficient way to store this information.  We really only need to keep track of the dimensions of the matrix and the values of any non-zero entries.  By default, scikit-learn returns just such a representation.  The `}<code parentName="p" {...{
        "className": "language-text"
      }}>{`.todense()`}</code>{` call above converted the low-memory sparse matrix format to an unabbreviated format that includes all the zeros.  Since most of our feature vectors are likely to be very sparse, you'll typically want to work with the sparse matrix format.  `}</p>
    <h2 {...{
      "id": "values-to-features",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h2" {...{
        "href": "#values-to-features",
        "aria-label": "values to features permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Values to features`}</h2>
    <p>{`For ease of inspection, let's create a reverse mapping from (index `}<span parentName="p" {...{
        "className": "math math-inline"
      }}><span parentName="span" {...{
          "className": "katex"
        }}><span parentName="span" {...{
            "className": "katex-mathml"
          }}><math parentName="span" {...{
              "xmlns": "http://www.w3.org/1998/Math/MathML"
            }}><semantics parentName="math"><mrow parentName="semantics"><mo parentName="mrow">{`→`}</mo></mrow><annotation parentName="semantics" {...{
                  "encoding": "application/x-tex"
                }}>{`\\rightarrow`}</annotation></semantics></math></span><span parentName="span" {...{
            "className": "katex-html",
            "aria-hidden": "true"
          }}><span parentName="span" {...{
              "className": "base"
            }}><span parentName="span" {...{
                "className": "strut",
                "style": {
                  "height": "0.3669em"
                }
              }}></span><span parentName="span" {...{
                "className": "mrel"
              }}>{`→`}</span></span></span></span></span>{` item).`}</p>
    <div {...{
      "className": "gatsby-highlight",
      "data-language": "python"
    }}><pre parentName="div" {...{
        "className": "language-python"
      }}><code parentName="pre" {...{
          "className": "language-python"
        }}>{`i2v `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{` `}<span parentName="code" {...{
            "className": "token builtin"
          }}>{`dict`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`i`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{` v`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span>{` `}<span parentName="code" {...{
            "className": "token keyword"
          }}>{`for`}</span>{` `}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`v`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{` i`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span>{` `}<span parentName="code" {...{
            "className": "token keyword"
          }}>{`in`}</span>{` vectorizer`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`vocabulary_`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`items`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span></code></pre></div>
    <p>{`What is the feature in the first position (index = 0)?`}</p>
    <div {...{
      "className": "gatsby-highlight",
      "data-language": "python"
    }}><pre parentName="div" {...{
        "className": "language-python"
      }}><code parentName="pre" {...{
          "className": "language-python"
        }}><span parentName="code" {...{
            "className": "token comment"
          }}>{`# what is the feature in the first position (index=0)?`}</span>{`
i2v`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`[`}</span><span parentName="code" {...{
            "className": "token number"
          }}>{`0`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`]`}</span></code></pre></div>
    <p>{`What is the feature in the sixth position (index = 5)?`}</p>
    <div {...{
      "className": "gatsby-highlight",
      "data-language": "python"
    }}><pre parentName="div" {...{
        "className": "language-python"
      }}><code parentName="pre" {...{
          "className": "language-python"
        }}><span parentName="code" {...{
            "className": "token comment"
          }}>{`# what is the feature in the sixth position (index = 5)?`}</span>{`
i2v`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`[`}</span><span parentName="code" {...{
            "className": "token number"
          }}>{`5`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`]`}</span></code></pre></div>
    <p>{`Alternatively, we can transform some data and then map it back to feature names (note the use of `}<code parentName="p" {...{
        "className": "language-text"
      }}>{`.todense()`}</code>{` here) to guarantee that the first element in each array corresponds to the index = 0 feature in our vocabulary.`}</p>
    <div {...{
      "className": "gatsby-highlight",
      "data-language": "python"
    }}><pre parentName="div" {...{
        "className": "language-python"
      }}><code parentName="pre" {...{
          "className": "language-python"
        }}>{`vectorizer`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`inverse_transform`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`
    vectorizer`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`transform`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`[`}</span><span parentName="code" {...{
            "className": "token string"
          }}>{`"It's going to cost you $23,030.12 or more."`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`]`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`todense`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span>{`
`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span></code></pre></div>
    <p>{`We've configured our vectorizer to use `}<em parentName="p">{`unigram`}</em>{` features.  How many features do we have in our vocabulary?`}</p>
    <div {...{
      "className": "gatsby-highlight",
      "data-language": "python"
    }}><pre parentName="div" {...{
        "className": "language-python"
      }}><code parentName="pre" {...{
          "className": "language-python"
        }}><span parentName="code" {...{
            "className": "token builtin"
          }}>{`len`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`vectorizer`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`vocabulary_`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span></code></pre></div>
    <h1 {...{
      "id": "encoding-labels",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h1" {...{
        "href": "#encoding-labels",
        "aria-label": "encoding labels permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Encoding labels`}</h1>
    <p>{`Now that we have a utility to generate our feature vectors, let's prepare our labels for training.`}</p>
    <div {...{
      "className": "gatsby-highlight",
      "data-language": "python"
    }}><pre parentName="div" {...{
        "className": "language-python"
      }}><code parentName="pre" {...{
          "className": "language-python"
        }}>{`lbl_encoder `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{` LabelEncoder`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span>{`
`}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# map distinct strings to integers`}</span>{`
lbl_encoder`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`fit`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`y_train`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span>{`

`}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# prints  array(['HAM', 'SPAM'], dtype='<U4')`}</span>{`
lbl_encoder`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`classes_`}</code></pre></div>
    <p>{`Like the vectorizer, we must `}<em parentName="p">{`fit`}</em>{` our label encoder to our data in order to establish a consistent string `}<span parentName="p" {...{
        "className": "math math-inline"
      }}><span parentName="span" {...{
          "className": "katex"
        }}><span parentName="span" {...{
            "className": "katex-mathml"
          }}><math parentName="span" {...{
              "xmlns": "http://www.w3.org/1998/Math/MathML"
            }}><semantics parentName="math"><mrow parentName="semantics"><mo parentName="mrow">{`→`}</mo></mrow><annotation parentName="semantics" {...{
                  "encoding": "application/x-tex"
                }}>{`\\rightarrow`}</annotation></semantics></math></span><span parentName="span" {...{
            "className": "katex-html",
            "aria-hidden": "true"
          }}><span parentName="span" {...{
              "className": "base"
            }}><span parentName="span" {...{
                "className": "strut",
                "style": {
                  "height": "0.3669em"
                }
              }}></span><span parentName="span" {...{
                "className": "mrel"
              }}>{`→`}</span></span></span></span></span>{` int mapping.  Once we've fit the label encoder, we can access an array of distinct class labels using the `}<code parentName="p" {...{
        "className": "language-text"
      }}>{`.classes_`}</code>{` attribute.  The index of each label corresponds to how that label will be represented as an integer.`}</p>
    <p>{`Let's try `}<strong parentName="p">{`transforming`}</strong>{` some string representations of labels to integer form...`}</p>
    <div {...{
      "className": "gatsby-highlight",
      "data-language": "python"
    }}><pre parentName="div" {...{
        "className": "language-python"
      }}><code parentName="pre" {...{
          "className": "language-python"
        }}><span parentName="code" {...{
            "className": "token comment"
          }}>{`# prints array([0])`}</span>{`
lbl_encoder`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`transform`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`[`}</span><span parentName="code" {...{
            "className": "token string"
          }}>{`"HAM"`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`]`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span></code></pre></div>
    <h1 {...{
      "id": "classifying-documents",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h1" {...{
        "href": "#classifying-documents",
        "aria-label": "classifying documents permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Classifying documents`}</h1>
    <h2 {...{
      "id": "training",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h2" {...{
        "href": "#training",
        "aria-label": "training permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Training`}</h2>
    <p>{`Now that we can transform documents into feature vectors and represent class labels as a vector of integers, let's train a classifier.  Here we'll use a naïve Bayes classifier, but the API we'll use to train and predict is shared by all classifiers defined in scikit-learn. `}</p>
    <div {...{
      "className": "gatsby-highlight",
      "data-language": "python"
    }}><pre parentName="div" {...{
        "className": "language-python"
      }}><code parentName="pre" {...{
          "className": "language-python"
        }}>{`clf `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{` MultinomialNB`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`
  `}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# should we learn p(y) from our data?`}</span>{`
  fit_prior`}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span><span parentName="code" {...{
            "className": "token boolean"
          }}>{`True`}</span>{`
`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span>{`

`}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# train the classifier by fitting it to some X and y`}</span>{`
clf`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`fit`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`vectorizer`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`transform`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`docs_train`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{` lbl_encoder`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`transform`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`y_train`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span>{`

`}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# prints array([0, 1])`}</span>{`
clf`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`classes_`}</code></pre></div>
    <h2 {...{
      "id": "predicting",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h2" {...{
        "href": "#predicting",
        "aria-label": "predicting permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Predicting`}</h2>
    <p>{`Now that we've trained our classifier, we can use it to make predictions:`}</p>
    <div {...{
      "className": "gatsby-highlight",
      "data-language": "python"
    }}><pre parentName="div" {...{
        "className": "language-python"
      }}><code parentName="pre" {...{
          "className": "language-python"
        }}>{`yhats       `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{` clf`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`predict`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`vectorizer`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`transform`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`docs_test`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span>{`
`}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# convert ints to strings for readability`}</span>{`
predictions `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{` lbl_encoder`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`inverse_transform`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`yhats`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span>{`
`}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# let's compare our predictions to our true labels...`}</span>{`
`}<span parentName="code" {...{
            "className": "token keyword"
          }}>{`for`}</span>{` pred`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{` gold `}<span parentName="code" {...{
            "className": "token keyword"
          }}>{`in`}</span>{` `}<span parentName="code" {...{
            "className": "token builtin"
          }}>{`zip`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`predictions`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{` y_test`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`:`}</span>{`
  `}<span parentName="code" {...{
            "className": "token keyword"
          }}>{`print`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span><span parentName="code" {...{
            "className": "token string-interpolation"
          }}><span parentName="span" {...{
              "className": "token string"
            }}>{`f"PRED: `}</span><span parentName="span" {...{
              "className": "token interpolation"
            }}><span parentName="span" {...{
                "className": "token punctuation"
              }}>{`{`}</span>{`pred`}<span parentName="span" {...{
                "className": "token punctuation"
              }}>{`}`}</span></span><span parentName="span" {...{
              "className": "token string"
            }}>{` \\tGOLD: `}</span><span parentName="span" {...{
              "className": "token interpolation"
            }}><span parentName="span" {...{
                "className": "token punctuation"
              }}>{`{`}</span>{`gold`}<span parentName="span" {...{
                "className": "token punctuation"
              }}>{`}`}</span></span><span parentName="span" {...{
              "className": "token string"
            }}>{`"`}</span></span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span></code></pre></div>
    <p>{`Of course, this is just a toy dataset without any form of preprocessing or normalization being applied. `}</p>
    <h1 {...{
      "id": "next-steps",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h1" {...{
        "href": "#next-steps",
        "aria-label": "next steps permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Next steps`}</h1>
    <p>{`You now have a sense of how to use scikit-learn to extract `}<span parentName="p" {...{
        "className": "math math-inline"
      }}><span parentName="span" {...{
          "className": "katex"
        }}><span parentName="span" {...{
            "className": "katex-mathml"
          }}><math parentName="span" {...{
              "xmlns": "http://www.w3.org/1998/Math/MathML"
            }}><semantics parentName="math"><mrow parentName="semantics"><mi parentName="mrow">{`n`}</mi></mrow><annotation parentName="semantics" {...{
                  "encoding": "application/x-tex"
                }}>{`n`}</annotation></semantics></math></span><span parentName="span" {...{
            "className": "katex-html",
            "aria-hidden": "true"
          }}><span parentName="span" {...{
              "className": "base"
            }}><span parentName="span" {...{
                "className": "strut",
                "style": {
                  "height": "0.4306em"
                }
              }}></span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`n`}</span></span></span></span></span>{`-gram features and train a very simple classifier.  In future lessons, we'll look at ways of improving this pipeline.  Before moving on, though, let's practice ...`}</p>
    <h1 {...{
      "id": "practice",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h1" {...{
        "href": "#practice",
        "aria-label": "practice permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Practice`}</h1>
    <ul>
      <li parentName="ul">{`What values for `}<span parentName="li" {...{
          "className": "math math-inline"
        }}><span parentName="span" {...{
            "className": "katex"
          }}><span parentName="span" {...{
              "className": "katex-mathml"
            }}><math parentName="span" {...{
                "xmlns": "http://www.w3.org/1998/Math/MathML"
              }}><semantics parentName="math"><mrow parentName="semantics"><mi parentName="mrow">{`n`}</mi></mrow><annotation parentName="semantics" {...{
                    "encoding": "application/x-tex"
                  }}>{`n`}</annotation></semantics></math></span><span parentName="span" {...{
              "className": "katex-html",
              "aria-hidden": "true"
            }}><span parentName="span" {...{
                "className": "base"
              }}><span parentName="span" {...{
                  "className": "strut",
                  "style": {
                    "height": "0.4306em"
                  }
                }}></span><span parentName="span" {...{
                  "className": "mord mathnormal"
                }}>{`n`}</span></span></span></span></span>{` are we using with our `}<code parentName="li" {...{
          "className": "language-text"
        }}>{`CountVectorizer`}</code>{` instance?`}</li>
      <li parentName="ul">{`Is "pay" represented in the matrix that is returned by `}<code parentName="li" {...{
          "className": "language-text"
        }}>{`.transform()`}</code>{`?  Why or why not?`}</li>
      <li parentName="ul">{`Why shouldn't you use `}<code parentName="li" {...{
          "className": "language-text"
        }}>{`.fit()`}</code>{` or `}<code parentName="li" {...{
          "className": "language-text"
        }}>{`.fit_transform()`}</code>{` at prediction time?`}</li>
    </ul>
    <h2 {...{
      "id": "unknown-features",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h2" {...{
        "href": "#unknown-features",
        "aria-label": "unknown features permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Unknown features`}</h2>
    <ul>
      <li parentName="ul">{`What happens if we pass a datum composed solely of unseen/unknown features?`}</li>
    </ul>
    <div {...{
      "className": "gatsby-highlight",
      "data-language": "python"
    }}><pre parentName="div" {...{
        "className": "language-python"
      }}><code parentName="pre" {...{
          "className": "language-python"
        }}>{`vectorizer`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`transform`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`[`}</span><span parentName="code" {...{
            "className": "token string"
          }}>{`"ZAMBORTANI DIEMPO"`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`]`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`todense`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span></code></pre></div>
    <div {...{
      "className": "gatsby-highlight",
      "data-language": "python"
    }}><pre parentName="div" {...{
        "className": "language-python"
      }}><code parentName="pre" {...{
          "className": "language-python"
        }}>{`vectorizer`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`transform`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`[`}</span><span parentName="code" {...{
            "className": "token string"
          }}>{`"Kltpzyxm"`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`]`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`todense`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span></code></pre></div>
    <div {...{
      "className": "gatsby-highlight",
      "data-language": "python"
    }}><pre parentName="div" {...{
        "className": "language-python"
      }}><code parentName="pre" {...{
          "className": "language-python"
        }}>{`vectorizer`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`transform`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`[`}</span><span parentName="code" {...{
            "className": "token string"
          }}>{`"$20.00"`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`]`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`todense`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span></code></pre></div>
    <ul>
      <li parentName="ul">
        <p parentName="li">{`What happens when we pass the `}<code parentName="p" {...{
            "className": "language-text"
          }}>{`.transform()`}</code>{` method of our label encoder instance an unseen label after having called `}<code parentName="p" {...{
            "className": "language-text"
          }}>{`.fit()`}</code>{`?`}</p>
      </li>
      <li parentName="ul">
        <p parentName="li">{`Why is it that calling `}<code parentName="p" {...{
            "className": "language-text"
          }}>{`.transform()`}</code>{` on either `}<strong parentName="p">{`ZAMBORTANI DIEMPO`}</strong>{` or `}<strong parentName="p">{`Kltpzyxm`}</strong>{` results in vectors of the same length?`}</p>
      </li>
      <li parentName="ul">
        <p parentName="li">{`Why are all values in both `}<strong parentName="p">{`ZAMBORTANI DIEMPO`}</strong>{` and `}<strong parentName="p">{`Kltpzyxm`}</strong>{` vectors 0?`}</p>
      </li>
    </ul>

    <div {...{
      "className": "footnotes"
    }}>
      <hr parentName="div"></hr>
      <ol parentName="div">
        <li parentName="ol" {...{
          "id": "fn-1"
        }}>{`clustering methods`}<a parentName="li" {...{
            "href": "#fnref-1",
            "className": "footnote-backref"
          }}>{`↩`}</a></li>
      </ol>
    </div>
    </MDXLayout>;
}
;
// Tag the generated content function with a static `isMDXComponent` flag.
// NOTE(review): presumably read by the MDX runtime/tooling to recognize
// compiled MDX documents — confirm against the @mdx-js version in use.
MDXContent.isMDXComponent = true;
      