import * as React from 'react'
  /* @jsx mdx */
import { mdx } from '@mdx-js/react';
/* @jsxRuntime classic */

/* @jsx mdx */

import DefaultLayout from "/home/runner/work/myedibleenso.github.io/myedibleenso.github.io/src/components/BasicLayout.js";
// Layout component that wraps the rendered page content.
const MDXLayout = DefaultLayout;

// Frontmatter exported alongside the page component; this page declares none.
export const _frontmatter = {};

// Props handed to the layout on every render.
const layoutProps = {
  _frontmatter: _frontmatter
};
export default function MDXContent({
  components,
  ...props
}) {
  return <MDXLayout {...layoutProps} {...props} components={components} mdxType="MDXLayout">


    <h2 {...{
      "id": "custom-preprocessor",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h2" {...{
        "href": "#custom-preprocessor",
        "aria-label": "custom preprocessor permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Custom preprocessor`}</h2>
    <p>{`For example, perhaps we want to plug in our own tokenizer or convert all text to lower case before generating `}<span parentName="p" {...{
        "className": "math math-inline"
      }}><span parentName="span" {...{
          "className": "katex"
        }}><span parentName="span" {...{
            "className": "katex-mathml"
          }}><math parentName="span" {...{
              "xmlns": "http://www.w3.org/1998/Math/MathML"
            }}><semantics parentName="math"><mrow parentName="semantics"><mi parentName="mrow">{`n`}</mi></mrow><annotation parentName="semantics" {...{
                  "encoding": "application/x-tex"
                }}>{`n`}</annotation></semantics></math></span><span parentName="span" {...{
            "className": "katex-html",
            "aria-hidden": "true"
          }}><span parentName="span" {...{
              "className": "base"
            }}><span parentName="span" {...{
                "className": "strut",
                "style": {
                  "height": "0.4306em"
                }
              }}></span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`n`}</span></span></span></span></span>{`-gram counts.`}</p>
    <div {...{
      "className": "gatsby-highlight",
      "data-language": "python"
    }}><pre parentName="div" {...{
        "className": "language-python"
      }}><code parentName="pre" {...{
          "className": "language-python"
        }}><span parentName="code" {...{
            "className": "token comment"
          }}>{`# see https://www.sphinx-doc.org/en/master/usage/restructuredtext/domains.html#python-signatures`}</span>{`

`}<span parentName="code" {...{
            "className": "token keyword"
          }}>{`def`}</span>{` `}<span parentName="code" {...{
            "className": "token function"
          }}>{`text_normalizer`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`text`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`:`}</span>{` `}<span parentName="code" {...{
            "className": "token builtin"
          }}>{`str`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span>{` `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`-`}</span><span parentName="code" {...{
            "className": "token operator"
          }}>{`>`}</span>{` `}<span parentName="code" {...{
            "className": "token builtin"
          }}>{`str`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`:`}</span>{`
    `}<span parentName="code" {...{
            "className": "token triple-quoted-string string"
          }}>{`'''
    Normalizes text (ex. $42.32 -> CURRENCY).
    
    :param str text: The raw text to be normalized
    :return: normalized text
    :rtype: str
    '''`}</span>{`
    normalized `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{` text`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`lower`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span>{`

    CURRENCY   `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{` re`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span><span parentName="code" {...{
            "className": "token builtin"
          }}>{`compile`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span><span parentName="code" {...{
            "className": "token string"
          }}>{`"\\$\\d[\\d,]*\\.?\\d{0,2}"`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span>{`
    URL        `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{` re`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span><span parentName="code" {...{
            "className": "token builtin"
          }}>{`compile`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span><span parentName="code" {...{
            "className": "token string"
          }}>{`"https?://[^\\s]+?(?=\\.?$|[\\.,]\\s)"`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span>{`

    normalized `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{` re`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`sub`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`pattern`}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{`CURRENCY`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{` repl`}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span><span parentName="code" {...{
            "className": "token string"
          }}>{`"CURRENCY"`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{` string`}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{`normalized`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span>{`
    normalized `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{` re`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`sub`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`pattern`}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{`URL`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{` repl`}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span><span parentName="code" {...{
            "className": "token string"
          }}>{`"URL"`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{` string`}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{`normalized`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span>{`
    
    normalized `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{` normalized`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`strip`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span>{`
    `}<span parentName="code" {...{
            "className": "token keyword"
          }}>{`return`}</span>{` normalized`}</code></pre></div>
    <div {...{
      "className": "gatsby-highlight",
      "data-language": "python"
    }}><pre parentName="div" {...{
        "className": "language-python"
      }}><code parentName="pre" {...{
          "className": "language-python"
        }}><span parentName="code" {...{
            "className": "token comment"
          }}>{`# test our normalizer`}</span>{`
text_normalizer`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span><span parentName="code" {...{
            "className": "token string"
          }}>{`"It's going to cost you $23,030.12 or more.  Send a payment to http://scam-you-later.com."`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span></code></pre></div>
    <blockquote>
      <p parentName="blockquote">{`"it's going to cost you CURRENCY or more.  send a payment to URL."`}</p>
    </blockquote>
    <h2 {...{
      "id": "create-our-count-vectorizer",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h2" {...{
        "href": "#create-our-count-vectorizer",
        "aria-label": "create our count vectorizer permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Create our count vectorizer¶`}</h2>
    <p>{`We'll register our text_normalizer with an instance of `}<a parentName="p" {...{
        "href": "https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html",
        "target": "_self",
        "rel": "nofollow"
      }}><code parentName="a" {...{
          "className": "language-text"
        }}>{`CountVectorizer`}</code></a>{`.`}</p>
    <div {...{
      "className": "gatsby-highlight",
      "data-language": "python"
    }}><pre parentName="div" {...{
        "className": "language-python"
      }}><code parentName="pre" {...{
          "className": "language-python"
        }}>{`vectorizer `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{` CountVectorizer`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`
        encoding`}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span><span parentName="code" {...{
            "className": "token string"
          }}>{`'utf-8'`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{`
        `}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# let's register our custom normalization function`}</span>{`
        preprocessor`}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{`text_normalizer`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{`
        stop_words`}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span><span parentName="code" {...{
            "className": "token boolean"
          }}>{`None`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{` 
        `}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# we'll use word n-grams `}</span>{`
        `}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# from size 1 (unigrams) to 3 (trigrams)`}</span>{`
        ngram_range`}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span><span parentName="code" {...{
            "className": "token number"
          }}>{`1`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{` `}<span parentName="code" {...{
            "className": "token number"
          }}>{`3`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{` 
        binary`}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span><span parentName="code" {...{
            "className": "token boolean"
          }}>{`False`}</span>{`
`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span></code></pre></div>
    <div {...{
      "className": "gatsby-highlight",
      "data-language": "python"
    }}><pre parentName="div" {...{
        "className": "language-python"
      }}><code parentName="pre" {...{
          "className": "language-python"
        }}>{`docs_train `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{` `}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`[`}</span>{`
    `}<span parentName="code" {...{
            "className": "token string"
          }}>{`"It's going to cost you $23,030.12 or more."`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{` 
    `}<span parentName="code" {...{
            "className": "token string"
          }}>{`"He charged me $10 for that banana."`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{` 
    `}<span parentName="code" {...{
            "className": "token string"
          }}>{`"Check this out: http://house-elves.com"`}</span>{`
`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`]`}</span></code></pre></div>

    </MDXLayout>;
}
// Marker property flagging this function as an MDX component.
MDXContent.isMDXComponent = true;
      