import * as React from 'react'
  /* @jsx mdx */
import { mdx } from '@mdx-js/react';
/* @jsxRuntime classic */
/* @jsx mdx */
import DefaultLayout from "/home/runner/work/myedibleenso.github.io/myedibleenso.github.io/src/components/BasicLayout.js";
// Frontmatter object exported for this page (empty here).
export const _frontmatter = {};

// Layout component that wraps the rendered page content.
const MDXLayout = DefaultLayout;

// Props spread onto the layout ahead of any caller-supplied props.
const layoutProps = { _frontmatter };
export default function MDXContent({
  components,
  ...props
}) {
  return <MDXLayout {...layoutProps} {...props} components={components} mdxType="MDXLayout">


    <h1 {...{
      "id": "overview",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h1" {...{
        "href": "#overview",
        "aria-label": "overview permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Overview`}</h1>
    <p>{`This lesson provides an introduction to text normalization.`}</p>
    <h1 {...{
      "id": "outcomes",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h1" {...{
        "href": "#outcomes",
        "aria-label": "outcomes permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Outcomes`}</h1>
    <ul>
      <li parentName="ul">{`define text normalization`}</li>
      <li parentName="ul">{`explain how text normalization is used`}</li>
      <li parentName="ul">{`present examples of text normalization`}</li>
    </ul>
    <h2 {...{
      "id": "prerequisites",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h2" {...{
        "href": "#prerequisites",
        "aria-label": "prerequisites permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Prerequisites`}</h2>
    <p>{`Before starting this tutorial, ensure that ...`}</p>
    <ul>
      <li parentName="ul">{`you are `}<a parentName="li" {...{
          "href": "/tutorials/tokens"
        }}>{`familiar with tokens and types`}</a></li>
      <li parentName="ul">{`you are `}<a parentName="li" {...{
          "href": "/tutorials/parts-of-speech"
        }}>{`familiar with parts of speech`}</a></li>
    </ul>
    <h1 {...{
      "id": "background",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h1" {...{
        "href": "#background",
        "aria-label": "background permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Background`}</h1>
    <p>{`Though it isn't frequently discussed in the literature, `}<strong parentName="p">{`data cleaning (ex. text normalization) and preprocessing is often the most time-consuming step in any analysis or machine learning pipeline`}</strong>{`.`}</p>
    <p><em parentName="p">{`What is text normalization?`}</em></p>
    <p>{`Real-world, unstructured text data is messy.  It's often full of typos and inconsistently formatted.  If you're scraping data from a website, you may find yourself needing to sift through a soup of broken HTML and JavaScript.  `}<strong parentName="p">{`Text normalization`}</strong>{` is a set of techniques designed to increase the uniformity of text.`}</p>
    <p><em parentName="p">{`Why normalize text?`}</em></p>
    <p>{`Text normalization is an essential `}<em parentName="p">{`preprocessing`}</em>{` step in many natural language processing applications, such as...`}</p>
    <ul>
      <li parentName="ul"><strong parentName="li">{`data canonicalization`}</strong><sup parentName="li" {...{
          "id": "fnref-1"
        }}><a parentName="sup" {...{
            "href": "#fn-1",
            "className": "footnote-ref"
          }}>{`1`}</a></sup></li>
    </ul>
    <ul>
      <li parentName="ul">{`email addresses (ex. "username AT domain DOT org" vs. "`}<a parentName="li" {...{
          "href": "mailto:username@domain.org",
          "target": "_self",
          "rel": "nofollow"
        }}>{`username@domain.org`}</a>{`")`}</li>
      <li parentName="ul">{`phone numbers (ex. "1.111.111.1111" vs. "1 (111) 111-1111")`}</li>
      <li parentName="ul">{`etc.`}</li>
    </ul>
    <ul>
      <li parentName="ul"><strong parentName="li">{`information retrieval`}</strong>{` (IR)`}<sup parentName="li" {...{
          "id": "fnref-8"
        }}><a parentName="sup" {...{
            "href": "#fn-8",
            "className": "footnote-ref"
          }}>{`8`}</a></sup>
        <ul parentName="li">
          <li parentName="ul">{`query normalization (ex. expanding acronyms, correcting spelling errors, case folding, canonicalizing emoticons, etc.)
`}<sup parentName="li" {...{
              "id": "fnref-8"
            }}><a parentName="sup" {...{
                "href": "#fn-8",
                "className": "footnote-ref"
              }}>{`8`}</a></sup>{`: You'll learn about IR in LING 531`}</li>
        </ul>
      </li>
    </ul>
    <ul>
      <li parentName="ul">
        <p parentName="li"><strong parentName="p">{`text to speech systems (TTS)`}</strong>{` `}</p>
        <ul parentName="li">
          <li parentName="ul">{`"$3.50" `}{`→`}{` "three dollars and fifty cents"`}</li>
        </ul>
      </li>
      <li parentName="ul">
        <p parentName="li"><strong parentName="p">{`machine learning (ML)`}</strong></p>
        <ul parentName="li">
          <li parentName="ul">
            <p parentName="li">{`for example, if we're detecting spam, knowing that an email contains a URL could be an informative feature, but perhaps not the attached query strings.  Similarly, the domain of a URL makes for a good feature (e.g., "`}<a parentName="p" {...{
                "href": "https://domain.org/stuff",
                "target": "_self",
                "rel": "nofollow"
              }}>{`https://domain.org/stuff`}</a>{`" `}{`→`}{` "domain.org").`}</p>
          </li>
          <li parentName="ul">
            <p parentName="li">{`enforcing consistent input for training and using statistical classifiers (ex. part of speech taggers, syntactic dependency parsers, etc.)`}</p>
          </li>
          <li parentName="ul">
            <p parentName="li">{`reducing the feature space`}</p>
            <ul parentName="li">
              <li parentName="ul">{`in a bag of words classifier`}<sup parentName="li" {...{
                  "id": "fnref-2"
                }}><a parentName="sup" {...{
                    "href": "#fn-2",
                    "className": "footnote-ref"
                  }}>{`2`}</a></sup>{`, each distinct word (token) is treated as a feature.  Words that occur extremely infrequently`}<sup parentName="li" {...{
                  "id": "fnref-3"
                }}><a parentName="sup" {...{
                    "href": "#fn-3",
                    "className": "footnote-ref"
                  }}>{`3`}</a></sup>{` overall are unlikely to be reliable features.  Some normalization might help to collapse distinct categories.`}</li>
            </ul>
          </li>
          <li parentName="ul">
            <p parentName="li">{`probability estimates`}</p>
            <ul parentName="li">
              <li parentName="ul">{`this will come up in later units, but a lack of normalization can lead to underestimating probabilities.  This can have a detrimental effect on probabilistic language models`}<sup parentName="li" {...{
                  "id": "fnref-4"
                }}><a parentName="sup" {...{
                    "href": "#fn-4",
                    "className": "footnote-ref"
                  }}>{`4`}</a></sup>{` and classifiers that use probabilities as features`}</li>
            </ul>
          </li>
        </ul>
      </li>
      <li parentName="ul">
        <p parentName="li"><strong parentName="p">{`spelling correction`}</strong>{` `}</p>
        <ul parentName="li">
          <li parentName="ul">{`detecting and correcting typographical errors`}</li>
        </ul>
      </li>
    </ul>
    <p><em parentName="p">{`Why `}<strong parentName="em">{`not`}</strong>{` normalize text?`}</em></p>
    <p>{`Some differences could be informative to a task. For example, use of punctuation (ex. `}<code parentName="p" {...{
        "className": "language-text"
      }}>{`!!!!`}</code>{`) or the dialectal distinction between `}<em parentName="p">{`color`}</em>{` and `}<em parentName="p">{`colour`}</em>{` might be useful features in certain classification tasks such as authorship detection.  Similarly, making the case uniform through case folding (all uppercase or all lowercase) might make it more difficult to distinguish between certain proper and non-proper nouns.`}</p>
    <table>
      <thead parentName="table">
        <tr parentName="thead">
          <th parentName="tr" {...{
            "align": null
          }}>{`Original`}</th>
          <th parentName="tr" {...{
            "align": null
          }}>{`Case-folded version`}</th>
        </tr>
      </thead>
      <tbody parentName="table">
        <tr parentName="tbody">
          <td parentName="tr" {...{
            "align": null
          }}>{`Please call about `}<strong parentName="td">{`ABE`}</strong><sup parentName="td" {...{
              "id": "fnref-5"
            }}><a parentName="sup" {...{
                "href": "#fn-5",
                "className": "footnote-ref"
              }}>{`5`}</a></sup></td>
          <td parentName="tr" {...{
            "align": null
          }}>{`please call about `}<strong parentName="td">{`abe`}</strong></td>
        </tr>
        <tr parentName="tbody">
          <td parentName="tr" {...{
            "align": null
          }}>{`Shinzo `}<strong parentName="td">{`Abe`}</strong>{` called Trudeau`}</td>
          <td parentName="tr" {...{
            "align": null
          }}>{`shinzo `}<strong parentName="td">{`abe`}</strong>{` called trudeau`}</td>
        </tr>
      </tbody>
    </table>
    <h1 {...{
      "id": "techniques",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h1" {...{
        "href": "#techniques",
        "aria-label": "techniques permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Techniques`}</h1>
    <p>{`Here we'll examine a few common techniques for text normalization.`}</p>
    <h2 {...{
      "id": "case-folding",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h2" {...{
        "href": "#case-folding",
        "aria-label": "case folding permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Case folding`}</h2>
    <p>{`The process of making the case of all text uniform (ex. uppercase or lowercase).`}</p>
    <table>
      <thead parentName="table">
        <tr parentName="thead">
          <th parentName="tr" {...{
            "align": null
          }}>{`Raw`}</th>
          <th parentName="tr" {...{
            "align": null
          }}>{`case-folded version`}</th>
        </tr>
      </thead>
      <tbody parentName="table">
        <tr parentName="tbody">
          <td parentName="tr" {...{
            "align": null
          }}>{`I LiKe TuRtLeS`}</td>
          <td parentName="tr" {...{
            "align": null
          }}>{`i like turtles`}</td>
        </tr>
        <tr parentName="tbody">
          <td parentName="tr" {...{
            "align": null
          }}>{`I LIKE TURTLES`}</td>
          <td parentName="tr" {...{
            "align": null
          }}>{`i like turtles`}</td>
        </tr>
        <tr parentName="tbody">
          <td parentName="tr" {...{
            "align": null
          }}>{`I LIKE turtles`}</td>
          <td parentName="tr" {...{
            "align": null
          }}>{`i like turtles`}</td>
        </tr>
      </tbody>
    </table>
    <h2 {...{
      "id": "replacement",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h2" {...{
        "href": "#replacement",
        "aria-label": "replacement permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Replacement`}</h2>
    <p>{`It can be useful to standardize text by replacing elements of a shared type with some categorical label.  For an example, see the table below:`}</p>
    <table>
      <thead parentName="table">
        <tr parentName="thead">
          <th parentName="tr" {...{
            "align": null
          }}>{`Raw`}</th>
          <th parentName="tr" {...{
            "align": null
          }}>{`Replacement URL`}</th>
        </tr>
      </thead>
      <tbody parentName="table">
        <tr parentName="tbody">
          <td parentName="tr" {...{
            "align": null
          }}>{`"`}<a parentName="td" {...{
              "href": "https://parsertongue.org",
              "target": "_self",
              "rel": "nofollow"
            }}>{`https://parsertongue.org`}</a>{`"`}</td>
          <td parentName="tr" {...{
            "align": null
          }}>{`URL`}</td>
        </tr>
        <tr parentName="tbody">
          <td parentName="tr" {...{
            "align": null
          }}>{`"`}<a parentName="td" {...{
              "href": "https://arizona.edu",
              "target": "_self",
              "rel": "nofollow"
            }}>{`https://arizona.edu`}</a>{`"`}</td>
          <td parentName="tr" {...{
            "align": null
          }}>{`URL`}</td>
        </tr>
      </tbody>
    </table>
    <p>{`The replacement can be at varying levels of specificity.  For example, ...`}</p>
    <table>
      <thead parentName="table">
        <tr parentName="thead">
          <th parentName="tr" {...{
            "align": null
          }}>{`Raw`}</th>
          <th parentName="tr" {...{
            "align": null
          }}>{`Replacement URL`}</th>
        </tr>
      </thead>
      <tbody parentName="table">
        <tr parentName="tbody">
          <td parentName="tr" {...{
            "align": null
          }}>{`"`}<a parentName="td" {...{
              "href": "https://parsertongue.org",
              "target": "_self",
              "rel": "nofollow"
            }}>{`https://parsertongue.org`}</a>{`"`}</td>
          <td parentName="tr" {...{
            "align": null
          }}>{`parsertongue.org`}</td>
        </tr>
        <tr parentName="tbody">
          <td parentName="tr" {...{
            "align": null
          }}>{`"`}<a parentName="td" {...{
              "href": "https://parsertongue.org/about",
              "target": "_self",
              "rel": "nofollow"
            }}>{`https://parsertongue.org/about`}</a>{`"`}</td>
          <td parentName="tr" {...{
            "align": null
          }}>{`parsertongue.org`}</td>
        </tr>
        <tr parentName="tbody">
          <td parentName="tr" {...{
            "align": null
          }}>{`"`}<a parentName="td" {...{
              "href": "https://arizona.edu",
              "target": "_self",
              "rel": "nofollow"
            }}>{`https://arizona.edu`}</a>{`"`}</td>
          <td parentName="tr" {...{
            "align": null
          }}>{`arizona.edu`}</td>
        </tr>
        <tr parentName="tbody">
          <td parentName="tr" {...{
            "align": null
          }}>{`"`}<a parentName="td" {...{
              "href": "https://linguistics.arizona.edu/ma-native-american-languages-linguistics",
              "target": "_self",
              "rel": "nofollow"
            }}>{`https://linguistics.arizona.edu/ma-native-american-languages-linguistics`}</a>{`"`}</td>
          <td parentName="tr" {...{
            "align": null
          }}>{`linguistics.arizona.edu`}</td>
        </tr>
      </tbody>
    </table>
    <h3 {...{
      "id": "lemmatization",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h3" {...{
        "href": "#lemmatization",
        "aria-label": "lemmatization permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Lemmatization`}</h3>
    <p><strong parentName="p">{`Lemmatization`}</strong>{` is the assignment of a canonical form`}<sup parentName="p" {...{
        "id": "fnref-6"
      }}><a parentName="sup" {...{
          "href": "#fn-6",
          "className": "footnote-ref"
        }}>{`6`}</a></sup>{` to all members of a group sharing the same `}<a parentName="p" {...{
        "href": "https://en.wikipedia.org/wiki/Lexeme",
        "target": "_self",
        "rel": "nofollow"
      }}>{`lexeme`}</a>{`.  For each token sharing a lexeme, replace the token with its lemma form (i.e., the form you'd see in the dictionary).`}</p>
    <p><span parentName="p" {...{
        "className": "math math-inline"
      }}><span parentName="span" {...{
          "className": "katex"
        }}><span parentName="span" {...{
            "className": "katex-mathml"
          }}><math parentName="span" {...{
              "xmlns": "http://www.w3.org/1998/Math/MathML"
            }}><semantics parentName="math"><mrow parentName="semantics"><mo parentName="mrow" {...{
                    "stretchy": "false"
                  }}>{`{`}</mo><mi parentName="mrow">{`f`}</mi><mi parentName="mrow">{`i`}</mi><mi parentName="mrow">{`s`}</mi><mi parentName="mrow">{`h`}</mi><mo parentName="mrow" {...{
                    "separator": "true"
                  }}>{`,`}</mo><mi parentName="mrow">{`f`}</mi><mi parentName="mrow">{`i`}</mi><mi parentName="mrow">{`s`}</mi><mi parentName="mrow">{`h`}</mi><mi parentName="mrow">{`i`}</mi><mi parentName="mrow">{`n`}</mi><mi parentName="mrow">{`g`}</mi><mo parentName="mrow" {...{
                    "separator": "true"
                  }}>{`,`}</mo><mi parentName="mrow">{`f`}</mi><mi parentName="mrow">{`i`}</mi><mi parentName="mrow">{`s`}</mi><mi parentName="mrow">{`h`}</mi><mi parentName="mrow">{`e`}</mi><mi parentName="mrow">{`s`}</mi><mo parentName="mrow" {...{
                    "stretchy": "false"
                  }}>{`}`}</mo><mo parentName="mrow">{`→`}</mo><mi parentName="mrow">{`f`}</mi><mi parentName="mrow">{`i`}</mi><mi parentName="mrow">{`s`}</mi><mi parentName="mrow">{`h`}</mi></mrow><annotation parentName="semantics" {...{
                  "encoding": "application/x-tex"
                }}>{`\\{fish, fishing, fishes\\} \\rightarrow fish`}</annotation></semantics></math></span><span parentName="span" {...{
            "className": "katex-html",
            "aria-hidden": "true"
          }}><span parentName="span" {...{
              "className": "base"
            }}><span parentName="span" {...{
                "className": "strut",
                "style": {
                  "height": "1em",
                  "verticalAlign": "-0.25em"
                }
              }}></span><span parentName="span" {...{
                "className": "mopen"
              }}>{`{`}</span><span parentName="span" {...{
                "className": "mord mathnormal",
                "style": {
                  "marginRight": "0.10764em"
                }
              }}>{`f`}</span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`i`}</span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`s`}</span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`h`}</span><span parentName="span" {...{
                "className": "mpunct"
              }}>{`,`}</span><span parentName="span" {...{
                "className": "mspace",
                "style": {
                  "marginRight": "0.1667em"
                }
              }}></span><span parentName="span" {...{
                "className": "mord mathnormal",
                "style": {
                  "marginRight": "0.10764em"
                }
              }}>{`f`}</span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`i`}</span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`s`}</span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`hin`}</span><span parentName="span" {...{
                "className": "mord mathnormal",
                "style": {
                  "marginRight": "0.03588em"
                }
              }}>{`g`}</span><span parentName="span" {...{
                "className": "mpunct"
              }}>{`,`}</span><span parentName="span" {...{
                "className": "mspace",
                "style": {
                  "marginRight": "0.1667em"
                }
              }}></span><span parentName="span" {...{
                "className": "mord mathnormal",
                "style": {
                  "marginRight": "0.10764em"
                }
              }}>{`f`}</span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`i`}</span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`s`}</span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`h`}</span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`es`}</span><span parentName="span" {...{
                "className": "mclose"
              }}>{`}`}</span><span parentName="span" {...{
                "className": "mspace",
                "style": {
                  "marginRight": "0.2778em"
                }
              }}></span><span parentName="span" {...{
                "className": "mrel"
              }}>{`→`}</span><span parentName="span" {...{
                "className": "mspace",
                "style": {
                  "marginRight": "0.2778em"
                }
              }}></span></span><span parentName="span" {...{
              "className": "base"
            }}><span parentName="span" {...{
                "className": "strut",
                "style": {
                  "height": "0.8889em",
                  "verticalAlign": "-0.1944em"
                }
              }}></span><span parentName="span" {...{
                "className": "mord mathnormal",
                "style": {
                  "marginRight": "0.10764em"
                }
              }}>{`f`}</span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`i`}</span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`s`}</span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`h`}</span></span></span></span></span></p>
    <p><span parentName="p" {...{
        "className": "math math-inline"
      }}><span parentName="span" {...{
          "className": "katex"
        }}><span parentName="span" {...{
            "className": "katex-mathml"
          }}><math parentName="span" {...{
              "xmlns": "http://www.w3.org/1998/Math/MathML"
            }}><semantics parentName="math"><mrow parentName="semantics"><mo parentName="mrow" {...{
                    "stretchy": "false"
                  }}>{`{`}</mo><mi parentName="mrow">{`s`}</mi><mi parentName="mrow">{`n`}</mi><mi parentName="mrow">{`e`}</mi><mi parentName="mrow">{`a`}</mi><mi parentName="mrow">{`k`}</mi><mo parentName="mrow" {...{
                    "separator": "true"
                  }}>{`,`}</mo><mi parentName="mrow">{`s`}</mi><mi parentName="mrow">{`n`}</mi><mi parentName="mrow">{`e`}</mi><mi parentName="mrow">{`a`}</mi><mi parentName="mrow">{`k`}</mi><mi parentName="mrow">{`i`}</mi><mi parentName="mrow">{`n`}</mi><mi parentName="mrow">{`g`}</mi><mo parentName="mrow" {...{
                    "separator": "true"
                  }}>{`,`}</mo><mi parentName="mrow">{`s`}</mi><mi parentName="mrow">{`n`}</mi><mi parentName="mrow">{`u`}</mi><mi parentName="mrow">{`c`}</mi><mi parentName="mrow">{`k`}</mi><mo parentName="mrow" {...{
                    "separator": "true"
                  }}>{`,`}</mo><mi parentName="mrow">{`s`}</mi><mi parentName="mrow">{`n`}</mi><mi parentName="mrow">{`e`}</mi><mi parentName="mrow">{`a`}</mi><mi parentName="mrow">{`k`}</mi><mi parentName="mrow">{`e`}</mi><mi parentName="mrow">{`d`}</mi><mo parentName="mrow" {...{
                    "stretchy": "false"
                  }}>{`}`}</mo><mo parentName="mrow">{`→`}</mo><mi parentName="mrow">{`s`}</mi><mi parentName="mrow">{`n`}</mi><mi parentName="mrow">{`e`}</mi><mi parentName="mrow">{`a`}</mi><mi parentName="mrow">{`k`}</mi></mrow><annotation parentName="semantics" {...{
                  "encoding": "application/x-tex"
                }}>{`\\{sneak, sneaking, snuck, sneaked\\} \\rightarrow sneak`}</annotation></semantics></math></span><span parentName="span" {...{
            "className": "katex-html",
            "aria-hidden": "true"
          }}><span parentName="span" {...{
              "className": "base"
            }}><span parentName="span" {...{
                "className": "strut",
                "style": {
                  "height": "1em",
                  "verticalAlign": "-0.25em"
                }
              }}></span><span parentName="span" {...{
                "className": "mopen"
              }}>{`{`}</span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`s`}</span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`n`}</span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`e`}</span><span parentName="span" {...{
                "className": "mord mathnormal",
                "style": {
                  "marginRight": "0.03148em"
                }
              }}>{`ak`}</span><span parentName="span" {...{
                "className": "mpunct"
              }}>{`,`}</span><span parentName="span" {...{
                "className": "mspace",
                "style": {
                  "marginRight": "0.1667em"
                }
              }}></span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`s`}</span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`n`}</span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`e`}</span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`akin`}</span><span parentName="span" {...{
                "className": "mord mathnormal",
                "style": {
                  "marginRight": "0.03588em"
                }
              }}>{`g`}</span><span parentName="span" {...{
                "className": "mpunct"
              }}>{`,`}</span><span parentName="span" {...{
                "className": "mspace",
                "style": {
                  "marginRight": "0.1667em"
                }
              }}></span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`s`}</span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`n`}</span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`u`}</span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`c`}</span><span parentName="span" {...{
                "className": "mord mathnormal",
                "style": {
                  "marginRight": "0.03148em"
                }
              }}>{`k`}</span><span parentName="span" {...{
                "className": "mpunct"
              }}>{`,`}</span><span parentName="span" {...{
                "className": "mspace",
                "style": {
                  "marginRight": "0.1667em"
                }
              }}></span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`s`}</span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`n`}</span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`e`}</span><span parentName="span" {...{
                "className": "mord mathnormal",
                "style": {
                  "marginRight": "0.03148em"
                }
              }}>{`ak`}</span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`e`}</span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`d`}</span><span parentName="span" {...{
                "className": "mclose"
              }}>{`}`}</span><span parentName="span" {...{
                "className": "mspace",
                "style": {
                  "marginRight": "0.2778em"
                }
              }}></span><span parentName="span" {...{
                "className": "mrel"
              }}>{`→`}</span><span parentName="span" {...{
                "className": "mspace",
                "style": {
                  "marginRight": "0.2778em"
                }
              }}></span></span><span parentName="span" {...{
              "className": "base"
            }}><span parentName="span" {...{
                "className": "strut",
                "style": {
                  "height": "0.6944em"
                }
              }}></span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`s`}</span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`n`}</span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`e`}</span><span parentName="span" {...{
                "className": "mord mathnormal",
                "style": {
                  "marginRight": "0.03148em"
                }
              }}>{`ak`}</span></span></span></span></span><sup parentName="p" {...{
        "id": "fnref-7"
      }}><a parentName="sup" {...{
          "href": "#fn-7",
          "className": "footnote-ref"
        }}>{`7`}</a></sup></p>
    <p><span parentName="p" {...{
        "className": "math math-inline"
      }}><span parentName="span" {...{
          "className": "katex"
        }}><span parentName="span" {...{
            "className": "katex-mathml"
          }}><math parentName="span" {...{
              "xmlns": "http://www.w3.org/1998/Math/MathML"
            }}><semantics parentName="math"><mrow parentName="semantics"><mo parentName="mrow" {...{
                    "stretchy": "false"
                  }}>{`{`}</mo><mi parentName="mrow">{`s`}</mi><mi parentName="mrow">{`n`}</mi><mi parentName="mrow">{`e`}</mi><mi parentName="mrow">{`a`}</mi><mi parentName="mrow">{`k`}</mi><mi parentName="mrow">{`e`}</mi><mi parentName="mrow">{`r`}</mi><mo parentName="mrow" {...{
                    "separator": "true"
                  }}>{`,`}</mo><mi parentName="mrow">{`s`}</mi><mi parentName="mrow">{`n`}</mi><mi parentName="mrow">{`e`}</mi><mi parentName="mrow">{`a`}</mi><mi parentName="mrow">{`k`}</mi><mi parentName="mrow">{`e`}</mi><mi parentName="mrow">{`r`}</mi><mi parentName="mrow">{`s`}</mi><mo parentName="mrow" {...{
                    "stretchy": "false"
                  }}>{`}`}</mo><mo parentName="mrow">{`→`}</mo><mi parentName="mrow">{`s`}</mi><mi parentName="mrow">{`n`}</mi><mi parentName="mrow">{`e`}</mi><mi parentName="mrow">{`a`}</mi><mi parentName="mrow">{`k`}</mi><mi parentName="mrow">{`e`}</mi><mi parentName="mrow">{`r`}</mi></mrow><annotation parentName="semantics" {...{
                  "encoding": "application/x-tex"
                }}>{`\\{sneaker, sneakers\\} \\rightarrow sneaker`}</annotation></semantics></math></span><span parentName="span" {...{
            "className": "katex-html",
            "aria-hidden": "true"
          }}><span parentName="span" {...{
              "className": "base"
            }}><span parentName="span" {...{
                "className": "strut",
                "style": {
                  "height": "1em",
                  "verticalAlign": "-0.25em"
                }
              }}></span><span parentName="span" {...{
                "className": "mopen"
              }}>{`{`}</span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`s`}</span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`n`}</span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`e`}</span><span parentName="span" {...{
                "className": "mord mathnormal",
                "style": {
                  "marginRight": "0.03148em"
                }
              }}>{`ak`}</span><span parentName="span" {...{
                "className": "mord mathnormal",
                "style": {
                  "marginRight": "0.02778em"
                }
              }}>{`er`}</span><span parentName="span" {...{
                "className": "mpunct"
              }}>{`,`}</span><span parentName="span" {...{
                "className": "mspace",
                "style": {
                  "marginRight": "0.1667em"
                }
              }}></span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`s`}</span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`n`}</span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`e`}</span><span parentName="span" {...{
                "className": "mord mathnormal",
                "style": {
                  "marginRight": "0.03148em"
                }
              }}>{`ak`}</span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`ers`}</span><span parentName="span" {...{
                "className": "mclose"
              }}>{`}`}</span><span parentName="span" {...{
                "className": "mspace",
                "style": {
                  "marginRight": "0.2778em"
                }
              }}></span><span parentName="span" {...{
                "className": "mrel"
              }}>{`→`}</span><span parentName="span" {...{
                "className": "mspace",
                "style": {
                  "marginRight": "0.2778em"
                }
              }}></span></span><span parentName="span" {...{
              "className": "base"
            }}><span parentName="span" {...{
                "className": "strut",
                "style": {
                  "height": "0.6944em"
                }
              }}></span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`s`}</span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`n`}</span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`e`}</span><span parentName="span" {...{
                "className": "mord mathnormal",
                "style": {
                  "marginRight": "0.03148em"
                }
              }}>{`ak`}</span><span parentName="span" {...{
                "className": "mord mathnormal",
                "style": {
                  "marginRight": "0.02778em"
                }
              }}>{`er`}</span></span></span></span></span></p>
    <p><span parentName="p" {...{
        "className": "math math-inline"
      }}><span parentName="span" {...{
          "className": "katex"
        }}><span parentName="span" {...{
            "className": "katex-mathml"
          }}><math parentName="span" {...{
              "xmlns": "http://www.w3.org/1998/Math/MathML"
            }}><semantics parentName="math"><mrow parentName="semantics"><mo parentName="mrow" {...{
                    "stretchy": "false"
                  }}>{`{`}</mo><mi parentName="mrow">{`f`}</mi><mi parentName="mrow">{`a`}</mi><mi parentName="mrow">{`s`}</mi><mi parentName="mrow">{`t`}</mi><mo parentName="mrow" {...{
                    "separator": "true"
                  }}>{`,`}</mo><mi parentName="mrow">{`f`}</mi><mi parentName="mrow">{`a`}</mi><mi parentName="mrow">{`s`}</mi><mi parentName="mrow">{`t`}</mi><mi parentName="mrow">{`e`}</mi><mi parentName="mrow">{`r`}</mi><mo parentName="mrow" {...{
                    "separator": "true"
                  }}>{`,`}</mo><mi parentName="mrow">{`f`}</mi><mi parentName="mrow">{`a`}</mi><mi parentName="mrow">{`s`}</mi><mi parentName="mrow">{`t`}</mi><mi parentName="mrow">{`e`}</mi><mi parentName="mrow">{`s`}</mi><mi parentName="mrow">{`t`}</mi><mo parentName="mrow" {...{
                    "stretchy": "false"
                  }}>{`}`}</mo><mo parentName="mrow">{`→`}</mo><mi parentName="mrow">{`f`}</mi><mi parentName="mrow">{`a`}</mi><mi parentName="mrow">{`s`}</mi><mi parentName="mrow">{`t`}</mi></mrow><annotation parentName="semantics" {...{
                  "encoding": "application/x-tex"
                }}>{`\\{fast, faster, fastest\\} \\rightarrow fast`}</annotation></semantics></math></span><span parentName="span" {...{
            "className": "katex-html",
            "aria-hidden": "true"
          }}><span parentName="span" {...{
              "className": "base"
            }}><span parentName="span" {...{
                "className": "strut",
                "style": {
                  "height": "1em",
                  "verticalAlign": "-0.25em"
                }
              }}></span><span parentName="span" {...{
                "className": "mopen"
              }}>{`{`}</span><span parentName="span" {...{
                "className": "mord mathnormal",
                "style": {
                  "marginRight": "0.10764em"
                }
              }}>{`f`}</span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`a`}</span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`s`}</span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`t`}</span><span parentName="span" {...{
                "className": "mpunct"
              }}>{`,`}</span><span parentName="span" {...{
                "className": "mspace",
                "style": {
                  "marginRight": "0.1667em"
                }
              }}></span><span parentName="span" {...{
                "className": "mord mathnormal",
                "style": {
                  "marginRight": "0.10764em"
                }
              }}>{`f`}</span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`a`}</span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`s`}</span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`t`}</span><span parentName="span" {...{
                "className": "mord mathnormal",
                "style": {
                  "marginRight": "0.02778em"
                }
              }}>{`er`}</span><span parentName="span" {...{
                "className": "mpunct"
              }}>{`,`}</span><span parentName="span" {...{
                "className": "mspace",
                "style": {
                  "marginRight": "0.1667em"
                }
              }}></span><span parentName="span" {...{
                "className": "mord mathnormal",
                "style": {
                  "marginRight": "0.10764em"
                }
              }}>{`f`}</span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`a`}</span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`s`}</span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`t`}</span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`es`}</span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`t`}</span><span parentName="span" {...{
                "className": "mclose"
              }}>{`}`}</span><span parentName="span" {...{
                "className": "mspace",
                "style": {
                  "marginRight": "0.2778em"
                }
              }}></span><span parentName="span" {...{
                "className": "mrel"
              }}>{`→`}</span><span parentName="span" {...{
                "className": "mspace",
                "style": {
                  "marginRight": "0.2778em"
                }
              }}></span></span><span parentName="span" {...{
              "className": "base"
            }}><span parentName="span" {...{
                "className": "strut",
                "style": {
                  "height": "0.8889em",
                  "verticalAlign": "-0.1944em"
                }
              }}></span><span parentName="span" {...{
                "className": "mord mathnormal",
                "style": {
                  "marginRight": "0.10764em"
                }
              }}>{`f`}</span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`a`}</span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`s`}</span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`t`}</span></span></span></span></span></p>
    <p>{`If performed correctly, lemmatization should `}<strong parentName="p">{`not`}</strong>{` change a word's coarse-grained grammatical category:`}</p>
    <p><span parentName="p" {...{
        "className": "math math-inline"
      }}><span parentName="span" {...{
          "className": "katex"
        }}><span parentName="span" {...{
            "className": "katex-mathml"
          }}><math parentName="span" {...{
              "xmlns": "http://www.w3.org/1998/Math/MathML"
            }}><semantics parentName="math"><mrow parentName="semantics"><mo parentName="mrow" {...{
                    "stretchy": "false"
                  }}>{`{`}</mo><mi parentName="mrow">{`s`}</mi><mi parentName="mrow">{`t`}</mi><mi parentName="mrow">{`u`}</mi><mi parentName="mrow">{`d`}</mi><mi parentName="mrow">{`i`}</mi><mi parentName="mrow">{`o`}</mi><mi parentName="mrow">{`u`}</mi><mi parentName="mrow">{`s`}</mi><mo parentName="mrow" {...{
                    "stretchy": "false"
                  }}>{`}`}</mo><mo parentName="mrow">{`→`}</mo><mi parentName="mrow">{`s`}</mi><mi parentName="mrow">{`t`}</mi><mi parentName="mrow">{`u`}</mi><mi parentName="mrow">{`d`}</mi><mi parentName="mrow">{`i`}</mi><mi parentName="mrow">{`o`}</mi><mi parentName="mrow">{`u`}</mi><mi parentName="mrow">{`s`}</mi></mrow><annotation parentName="semantics" {...{
                  "encoding": "application/x-tex"
                }}>{`\\{studious\\} \\rightarrow studious`}</annotation></semantics></math></span><span parentName="span" {...{
            "className": "katex-html",
            "aria-hidden": "true"
          }}><span parentName="span" {...{
              "className": "base"
            }}><span parentName="span" {...{
                "className": "strut",
                "style": {
                  "height": "1em",
                  "verticalAlign": "-0.25em"
                }
              }}></span><span parentName="span" {...{
                "className": "mopen"
              }}>{`{`}</span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`s`}</span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`t`}</span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`u`}</span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`d`}</span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`i`}</span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`o`}</span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`u`}</span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`s`}</span><span parentName="span" {...{
                "className": "mclose"
              }}>{`}`}</span><span parentName="span" {...{
                "className": "mspace",
                "style": {
                  "marginRight": "0.2778em"
                }
              }}></span><span parentName="span" {...{
                "className": "mrel"
              }}>{`→`}</span><span parentName="span" {...{
                "className": "mspace",
                "style": {
                  "marginRight": "0.2778em"
                }
              }}></span></span><span parentName="span" {...{
              "className": "base"
            }}><span parentName="span" {...{
                "className": "strut",
                "style": {
                  "height": "0.6944em"
                }
              }}></span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`s`}</span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`t`}</span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`u`}</span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`d`}</span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`i`}</span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`o`}</span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`u`}</span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`s`}</span></span></span></span></span></p>
    <p><span parentName="p" {...{
        "className": "math math-inline"
      }}><span parentName="span" {...{
          "className": "katex"
        }}><span parentName="span" {...{
            "className": "katex-mathml"
          }}><math parentName="span" {...{
              "xmlns": "http://www.w3.org/1998/Math/MathML"
            }}><semantics parentName="math"><mrow parentName="semantics"><mo parentName="mrow" {...{
                    "stretchy": "false"
                  }}>{`{`}</mo><mi parentName="mrow">{`s`}</mi><mi parentName="mrow">{`t`}</mi><mi parentName="mrow">{`u`}</mi><mi parentName="mrow">{`d`}</mi><mi parentName="mrow">{`i`}</mi><mi parentName="mrow">{`e`}</mi><mi parentName="mrow">{`d`}</mi><mo parentName="mrow" {...{
                    "separator": "true"
                  }}>{`,`}</mo><mi parentName="mrow">{`s`}</mi><mi parentName="mrow">{`t`}</mi><mi parentName="mrow">{`u`}</mi><mi parentName="mrow">{`d`}</mi><mi parentName="mrow">{`y`}</mi><mi parentName="mrow">{`i`}</mi><mi parentName="mrow">{`n`}</mi><mi parentName="mrow">{`g`}</mi><mo parentName="mrow" {...{
                    "stretchy": "false"
                  }}>{`}`}</mo><mo parentName="mrow">{`→`}</mo><mi parentName="mrow">{`s`}</mi><mi parentName="mrow">{`t`}</mi><mi parentName="mrow">{`u`}</mi><mi parentName="mrow">{`d`}</mi><mi parentName="mrow">{`y`}</mi></mrow><annotation parentName="semantics" {...{
                  "encoding": "application/x-tex"
                }}>{`\\{studied, studying\\} \\rightarrow study`}</annotation></semantics></math></span><span parentName="span" {...{
            "className": "katex-html",
            "aria-hidden": "true"
          }}><span parentName="span" {...{
              "className": "base"
            }}><span parentName="span" {...{
                "className": "strut",
                "style": {
                  "height": "1em",
                  "verticalAlign": "-0.25em"
                }
              }}></span><span parentName="span" {...{
                "className": "mopen"
              }}>{`{`}</span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`s`}</span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`t`}</span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`u`}</span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`d`}</span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`i`}</span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`e`}</span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`d`}</span><span parentName="span" {...{
                "className": "mpunct"
              }}>{`,`}</span><span parentName="span" {...{
                "className": "mspace",
                "style": {
                  "marginRight": "0.1667em"
                }
              }}></span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`s`}</span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`t`}</span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`u`}</span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`d`}</span><span parentName="span" {...{
                "className": "mord mathnormal",
                "style": {
                  "marginRight": "0.03588em"
                }
              }}>{`y`}</span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`in`}</span><span parentName="span" {...{
                "className": "mord mathnormal",
                "style": {
                  "marginRight": "0.03588em"
                }
              }}>{`g`}</span><span parentName="span" {...{
                "className": "mclose"
              }}>{`}`}</span><span parentName="span" {...{
                "className": "mspace",
                "style": {
                  "marginRight": "0.2778em"
                }
              }}></span><span parentName="span" {...{
                "className": "mrel"
              }}>{`→`}</span><span parentName="span" {...{
                "className": "mspace",
                "style": {
                  "marginRight": "0.2778em"
                }
              }}></span></span><span parentName="span" {...{
              "className": "base"
            }}><span parentName="span" {...{
                "className": "strut",
                "style": {
                  "height": "0.8889em",
                  "verticalAlign": "-0.1944em"
                }
              }}></span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`s`}</span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`t`}</span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`u`}</span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`d`}</span><span parentName="span" {...{
                "className": "mord mathnormal",
                "style": {
                  "marginRight": "0.03588em"
                }
              }}>{`y`}</span></span></span></span></span></p>
    <p>{`This means that `}<em parentName="p">{`lemmatization relies on correctly identifying the `}<strong parentName="em">{`part of speech`}</strong>{` assigned to a token`}</em>{`:`}</p>
    <table>
      <thead parentName="table">
        <tr parentName="thead">
          <th parentName="tr" {...{
            "align": null
          }}>{`Word in context`}</th>
          <th parentName="tr" {...{
            "align": null
          }}>{`Coarse POS category`}</th>
          <th parentName="tr" {...{
            "align": null
          }}>{`Lemma`}</th>
        </tr>
      </thead>
      <tbody parentName="table">
        <tr parentName="tbody">
          <td parentName="tr" {...{
            "align": null
          }}>{`the carpenter's  `}<strong parentName="td">{`saw`}</strong>{`  is shiny`}</td>
          <td parentName="tr" {...{
            "align": null
          }}>{`NOUN`}</td>
          <td parentName="tr" {...{
            "align": null
          }}>{`saw`}</td>
        </tr>
        <tr parentName="tbody">
          <td parentName="tr" {...{
            "align": null
          }}>{`I  `}<strong parentName="td">{`saw`}</strong>{`  the boy with the telescope`}</td>
          <td parentName="tr" {...{
            "align": null
          }}>{`VERB`}</td>
          <td parentName="tr" {...{
            "align": null
          }}>{`see`}</td>
        </tr>
      </tbody>
    </table>
    <h2 {...{
      "id": "removal",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h2" {...{
        "href": "#removal",
        "aria-label": "removal permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Removal`}</h2>
    <p>{`It is not uncommon to remove certain text such as punctuation to make the data more uniform. If processing many documents from the same source, it may be useful to remove metadata.  For example, if you were processing text from `}<a parentName="p" {...{
        "href": "https://www.gutenberg.org/",
        "target": "_self",
        "rel": "nofollow"
      }}>{`Project Gutenberg`}</a>{`, you might want to remove or ignore content such as the table of contents and chapter headings.`}</p>
    <h2 {...{
      "id": "frequency-based-thresholding",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h2" {...{
        "href": "#frequency-based-thresholding",
        "aria-label": "frequency based thresholding permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Frequency-based thresholding`}</h2>
    <p>{`Depending on how you plan to use the text (e.g., training classifiers), you may wish to outright discard tokens below some frequency or replace them with some common symbol. For example, you may want to replace all tokens that occur only once with the symbol `}<code parentName="p" {...{
        "className": "language-text"
      }}>{`UNK`}</code>{` ("unknown").  This will be covered in LING 539.`}</p>
    <h2 {...{
      "id": "stemming",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h2" {...{
        "href": "#stemming",
        "aria-label": "stemming permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Stemming`}</h2>
    <p><strong parentName="p">{`Stemming`}</strong>{` is a string transformation process that aims to reduce a word down to some base form.  Unlike lemmatization, stemming may not return real words.  `}</p>
    <p>{`Typically, stemming is performed using a series of string transformation rules to iteratively whittle a token down to some simplified representation:`}</p>
    <p><code parentName="p" {...{
        "className": "language-text"
      }}>{`enthusiastically`}</code><br parentName="p"></br>{`
`}<code parentName="p" {...{
        "className": "language-text"
      }}>{`ally`}</code>{` `}<span parentName="p" {...{
        "className": "math math-inline"
      }}><span parentName="span" {...{
          "className": "katex"
        }}><span parentName="span" {...{
            "className": "katex-mathml"
          }}><math parentName="span" {...{
              "xmlns": "http://www.w3.org/1998/Math/MathML"
            }}><semantics parentName="math"><mrow parentName="semantics"><mo parentName="mrow">{`→`}</mo></mrow><annotation parentName="semantics" {...{
                  "encoding": "application/x-tex"
                }}>{`\\rightarrow`}</annotation></semantics></math></span><span parentName="span" {...{
            "className": "katex-html",
            "aria-hidden": "true"
          }}><span parentName="span" {...{
              "className": "base"
            }}><span parentName="span" {...{
                "className": "strut",
                "style": {
                  "height": "0.3669em"
                }
              }}></span><span parentName="span" {...{
                "className": "mrel"
              }}>{`→`}</span></span></span></span></span>{` `}<code parentName="p" {...{
        "className": "language-text"
      }}>{`al`}</code><br parentName="p"></br>{`
`}<code parentName="p" {...{
        "className": "language-text"
      }}>{`enthusiastical`}</code><br parentName="p"></br>{`
`}<code parentName="p" {...{
        "className": "language-text"
      }}>{`al`}</code>{` `}<span parentName="p" {...{
        "className": "math math-inline"
      }}><span parentName="span" {...{
          "className": "katex"
        }}><span parentName="span" {...{
            "className": "katex-mathml"
          }}><math parentName="span" {...{
              "xmlns": "http://www.w3.org/1998/Math/MathML"
            }}><semantics parentName="math"><mrow parentName="semantics"><mo parentName="mrow">{`→`}</mo></mrow><annotation parentName="semantics" {...{
                  "encoding": "application/x-tex"
                }}>{`\\rightarrow`}</annotation></semantics></math></span><span parentName="span" {...{
            "className": "katex-html",
            "aria-hidden": "true"
          }}><span parentName="span" {...{
              "className": "base"
            }}><span parentName="span" {...{
                "className": "strut",
                "style": {
                  "height": "0.3669em"
                }
              }}></span><span parentName="span" {...{
                "className": "mrel"
              }}>{`→`}</span></span></span></span></span>{` `}<span parentName="p" {...{
        "className": "math math-inline"
      }}><span parentName="span" {...{
          "className": "katex"
        }}><span parentName="span" {...{
            "className": "katex-mathml"
          }}><math parentName="span" {...{
              "xmlns": "http://www.w3.org/1998/Math/MathML"
            }}><semantics parentName="math"><mrow parentName="semantics"><mi parentName="mrow">{`ϵ`}</mi></mrow><annotation parentName="semantics" {...{
                  "encoding": "application/x-tex"
                }}>{`\\epsilon`}</annotation></semantics></math></span><span parentName="span" {...{
            "className": "katex-html",
            "aria-hidden": "true"
          }}><span parentName="span" {...{
              "className": "base"
            }}><span parentName="span" {...{
                "className": "strut",
                "style": {
                  "height": "0.4306em"
                }
              }}></span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`ϵ`}</span></span></span></span></span><br parentName="p"></br>{`
`}<code parentName="p" {...{
        "className": "language-text"
      }}>{`enthusiastic`}</code><br parentName="p"></br>{`
`}<code parentName="p" {...{
        "className": "language-text"
      }}>{`iastic`}</code>{` `}<span parentName="p" {...{
        "className": "math math-inline"
      }}><span parentName="span" {...{
          "className": "katex"
        }}><span parentName="span" {...{
            "className": "katex-mathml"
          }}><math parentName="span" {...{
              "xmlns": "http://www.w3.org/1998/Math/MathML"
            }}><semantics parentName="math"><mrow parentName="semantics"><mo parentName="mrow">{`→`}</mo></mrow><annotation parentName="semantics" {...{
                  "encoding": "application/x-tex"
                }}>{`\\rightarrow`}</annotation></semantics></math></span><span parentName="span" {...{
            "className": "katex-html",
            "aria-hidden": "true"
          }}><span parentName="span" {...{
              "className": "base"
            }}><span parentName="span" {...{
                "className": "strut",
                "style": {
                  "height": "0.3669em"
                }
              }}></span><span parentName="span" {...{
                "className": "mrel"
              }}>{`→`}</span></span></span></span></span>{` `}<code parentName="p" {...{
        "className": "language-text"
      }}>{`istic`}</code><br parentName="p"></br>{`
`}<code parentName="p" {...{
        "className": "language-text"
      }}>{`enthusistic`}</code><br parentName="p"></br>{`
`}<code parentName="p" {...{
        "className": "language-text"
      }}>{`istic`}</code>{` `}<span parentName="p" {...{
        "className": "math math-inline"
      }}><span parentName="span" {...{
          "className": "katex"
        }}><span parentName="span" {...{
            "className": "katex-mathml"
          }}><math parentName="span" {...{
              "xmlns": "http://www.w3.org/1998/Math/MathML"
            }}><semantics parentName="math"><mrow parentName="semantics"><mo parentName="mrow">{`→`}</mo></mrow><annotation parentName="semantics" {...{
                  "encoding": "application/x-tex"
                }}>{`\\rightarrow`}</annotation></semantics></math></span><span parentName="span" {...{
            "className": "katex-html",
            "aria-hidden": "true"
          }}><span parentName="span" {...{
              "className": "base"
            }}><span parentName="span" {...{
                "className": "strut",
                "style": {
                  "height": "0.3669em"
                }
              }}></span><span parentName="span" {...{
                "className": "mrel"
              }}>{`→`}</span></span></span></span></span>{` `}<code parentName="p" {...{
        "className": "language-text"
      }}>{`ist`}</code><br parentName="p"></br>{`
`}<code parentName="p" {...{
        "className": "language-text"
      }}>{`enthusist`}</code><br parentName="p"></br>{`
`}<code parentName="p" {...{
        "className": "language-text"
      }}>{`ist`}</code>{` `}<span parentName="p" {...{
        "className": "math math-inline"
      }}><span parentName="span" {...{
          "className": "katex"
        }}><span parentName="span" {...{
            "className": "katex-mathml"
          }}><math parentName="span" {...{
              "xmlns": "http://www.w3.org/1998/Math/MathML"
            }}><semantics parentName="math"><mrow parentName="semantics"><mo parentName="mrow">{`→`}</mo></mrow><annotation parentName="semantics" {...{
                  "encoding": "application/x-tex"
                }}>{`\\rightarrow`}</annotation></semantics></math></span><span parentName="span" {...{
            "className": "katex-html",
            "aria-hidden": "true"
          }}><span parentName="span" {...{
              "className": "base"
            }}><span parentName="span" {...{
                "className": "strut",
                "style": {
                  "height": "0.3669em"
                }
              }}></span><span parentName="span" {...{
                "className": "mrel"
              }}>{`→`}</span></span></span></span></span>{` `}<span parentName="p" {...{
        "className": "math math-inline"
      }}><span parentName="span" {...{
          "className": "katex"
        }}><span parentName="span" {...{
            "className": "katex-mathml"
          }}><math parentName="span" {...{
              "xmlns": "http://www.w3.org/1998/Math/MathML"
            }}><semantics parentName="math"><mrow parentName="semantics"><mi parentName="mrow">{`ϵ`}</mi></mrow><annotation parentName="semantics" {...{
                  "encoding": "application/x-tex"
                }}>{`\\epsilon`}</annotation></semantics></math></span><span parentName="span" {...{
            "className": "katex-html",
            "aria-hidden": "true"
          }}><span parentName="span" {...{
              "className": "base"
            }}><span parentName="span" {...{
                "className": "strut",
                "style": {
                  "height": "0.4306em"
                }
              }}></span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`ϵ`}</span></span></span></span></span><br parentName="p"></br>{`
`}<code parentName="p" {...{
        "className": "language-text"
      }}>{`enthus`}</code>{`  `}</p>
    <p>{`There are two types of errors associated with stemming: `}<em parentName="p">{`overstemming`}</em>{` and `}<em parentName="p">{`understemming`}</em>{`.`}</p>
    <p><strong parentName="p">{`Overstemming`}</strong>{` is when two words with distinct meanings are reduced to the same stem:`}</p>
    <table>
      <thead parentName="table">
        <tr parentName="thead">
          <th parentName="tr" {...{
            "align": null
          }}>{`Original`}</th>
          <th parentName="tr" {...{
            "align": null
          }}>{`over-stemmed form`}</th>
        </tr>
      </thead>
      <tbody parentName="table">
        <tr parentName="tbody">
          <td parentName="tr" {...{
            "align": null
          }}>{`universal`}</td>
          <td parentName="tr" {...{
            "align": null
          }}>{`univers`}</td>
        </tr>
        <tr parentName="tbody">
          <td parentName="tr" {...{
            "align": null
          }}>{`university`}</td>
          <td parentName="tr" {...{
            "align": null
          }}>{`univers`}</td>
        </tr>
      </tbody>
    </table>
    <p><strong parentName="p">{`Understemming`}</strong>{` is when two words with highly related meanings are not reduced to the same stem:`}</p>
    <table>
      <thead parentName="table">
        <tr parentName="thead">
          <th parentName="tr" {...{
            "align": null
          }}>{`Original`}</th>
          <th parentName="tr" {...{
            "align": null
          }}>{`under-stemmed form`}</th>
        </tr>
      </thead>
      <tbody parentName="table">
        <tr parentName="tbody">
          <td parentName="tr" {...{
            "align": null
          }}>{`alumna`}</td>
          <td parentName="tr" {...{
            "align": null
          }}>{`alumna`}</td>
        </tr>
        <tr parentName="tbody">
          <td parentName="tr" {...{
            "align": null
          }}>{`alumni`}</td>
          <td parentName="tr" {...{
            "align": null
          }}>{`alumni`}</td>
        </tr>
        <tr parentName="tbody">
          <td parentName="tr" {...{
            "align": null
          }}>{`alumnus`}</td>
          <td parentName="tr" {...{
            "align": null
          }}>{`alumnus`}</td>
        </tr>
      </tbody>
    </table>
    <p>{`You'll learn more about stemming algorithms in LING 538.`}</p>
    <h1 {...{
      "id": "next-steps",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h1" {...{
        "href": "#next-steps",
        "aria-label": "next steps permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Next steps`}</h1>
    <p>{`You've learned the motivation behind text normalization as well as places where it may cause issues. It's time to apply what you've learned...`}</p>
    <h2 {...{
      "id": "practice",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h2" {...{
        "href": "#practice",
        "aria-label": "practice permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Practice`}</h2>
    <ul>
      <li parentName="ul">
        <p parentName="li">{`Imagine you are working with data from `}<a parentName="p" {...{
            "href": "https://twitter.com",
            "target": "_self",
            "rel": "nofollow"
          }}>{`Twitter`}</a>{`.  Describe some text normalization/cleanup procedures you might use to preprocess the data.`}</p>
      </li>
      <li parentName="ul">
        <p parentName="li">{`Imagine you are working with emails.  Describe some text normalization/cleanup procedures you might use to preprocess the data.`}</p>
      </li>
    </ul>

    <div {...{
      "className": "footnotes"
    }}>
      <hr parentName="div"></hr>
      <ol parentName="div">
        <li parentName="ol" {...{
          "id": "fn-1"
        }}>{`i.e., ensuring the same format is used for all data belonging to some category`}<a parentName="li" {...{
            "href": "#fnref-1",
            "className": "footnote-backref"
          }}>{`↩`}</a></li>
        <li parentName="ol" {...{
          "id": "fn-2"
        }}>{`You'll learn how to build a bag of words (BoW) classifier in LING 539`}<a parentName="li" {...{
            "href": "#fnref-2",
            "className": "footnote-backref"
          }}>{`↩`}</a></li>
        <li parentName="ol" {...{
          "id": "fn-3"
        }}>{`The most extreme case being `}<a parentName="li" {...{
            "href": "https://en.wikipedia.org/wiki/Hapax_legomenon",
            "target": "_self",
            "rel": "nofollow"
          }}><em parentName="a">{`hapax legomenon`}</em></a><a parentName="li" {...{
            "href": "#fnref-3",
            "className": "footnote-backref"
          }}>{`↩`}</a></li>
        <li parentName="ol" {...{
          "id": "fn-4"
        }}>{`Predicting which word is likely to follow`}<a parentName="li" {...{
            "href": "#fnref-4",
            "className": "footnote-backref"
          }}>{`↩`}</a></li>
        <li parentName="ol" {...{
          "id": "fn-5"
        }}>{`As in `}<strong parentName="li">{`A`}</strong>{`pplication for `}<strong parentName="li">{`B`}</strong>{`enefits `}<strong parentName="li">{`E`}</strong>{`ligibility`}<a parentName="li" {...{
            "href": "#fnref-5",
            "className": "footnote-backref"
          }}>{`↩`}</a></li>
        <li parentName="ol" {...{
          "id": "fn-6"
        }}>{`the form you'd see in the dictionary`}<a parentName="li" {...{
            "href": "#fnref-6",
            "className": "footnote-backref"
          }}>{`↩`}</a></li>
        <li parentName="ol" {...{
          "id": "fn-7"
        }}>{`"Wait! Is `}<em parentName="li">{`snuck`}</em><span parentName="li">{` really a word?" If people use it, it's a word! `}<span {...{
              "role": "img",
              "aria-label": "grinning face with smiling eyes"
            }}>{`😄`}</span></span><a parentName="li" {...{
            "href": "#fnref-7",
            "className": "footnote-backref"
          }}>{`↩`}</a></li>
      </ol>
    </div>
    </MDXLayout>;
}
;
// Tag the exported component with an `isMDXComponent` flag.
// NOTE(review): presumably read by @mdx-js tooling to recognise compiled MDX output — confirm.
Object.assign(MDXContent, { isMDXComponent: true });
      