import * as React from 'react'
  /* @jsx mdx */
import { mdx } from '@mdx-js/react';
/* @jsxRuntime classic */
/* @jsx mdx */
import DefaultLayout from "/home/runner/work/myedibleenso.github.io/myedibleenso.github.io/src/components/BasicLayout.js";
// Frontmatter object exported alongside the page component; empty for this document.
export const _frontmatter = {};

// Layout component that wraps the rendered document body.
const MDXLayout = DefaultLayout;

// Props spread onto the layout ahead of any caller-supplied props.
const layoutProps = { _frontmatter };
export default function MDXContent({
  components,
  ...props
}) {
  return <MDXLayout {...layoutProps} {...props} components={components} mdxType="MDXLayout">


    <h1 {...{
      "id": "overview",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h1" {...{
        "href": "#overview",
        "aria-label": "overview permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Overview`}</h1>
    <p>{`In this lesson, we'll look at a definition of `}<strong parentName="p">{`token`}</strong>{` and see some examples in English.`}</p>
    <h1 {...{
      "id": "outcomes",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h1" {...{
        "href": "#outcomes",
        "aria-label": "outcomes permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Outcomes`}</h1>
    <p>{`After completing this lesson, you should be able to ...`}</p>
    <ul>
      <li parentName="ul">{`define `}<strong parentName="li">{`token`}</strong></li>
      <li parentName="ul">{`describe the distinction between `}<strong parentName="li">{`token`}</strong>{` and `}<strong parentName="li">{`type`}</strong></li>
      <li parentName="ul">{`describe patterns for delimiting tokens in one or more natural languages`}</li>
    </ul>
    <h1 {...{
      "id": "before-starting",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h1" {...{
        "href": "#before-starting",
        "aria-label": "before starting permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Before starting`}</h1>
    <ul>
      <li parentName="ul">{`Read the following section of Manning et al.'s `}<em parentName="li">{`Introduction to Information Retrieval`}</em>{`:`}
        <ul parentName="li">
          <li parentName="ul"><a parentName="li" {...{
              "href": "https://nlp.stanford.edu/IR-book/html/htmledition/tokenization-1.html",
              "target": "_self",
              "rel": "nofollow"
            }}>{`https://nlp.stanford.edu/IR-book/html/htmledition/tokenization-1.html`}</a></li>
        </ul>
      </li>
    </ul>
    <h1 {...{
      "id": "background",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h1" {...{
        "href": "#background",
        "aria-label": "background permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Background`}</h1>
    <h2 {...{
      "id": "what-is-a-token",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h2" {...{
        "href": "#what-is-a-token",
        "aria-label": "what is a token permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a><em parentName="h2">{`What is a token?`}</em></h2>
    <p>{`In `}<a parentName="p" {...{
        "href": "https://nlp.stanford.edu/IR-book/pdf/irbookonlinereading.pdf#page=59",
        "target": "_self",
        "rel": "nofollow"
      }}>{`2.2.1 of `}<em parentName="a">{`Introduction to Information Retrieval`}</em></a>{`, Christopher D. Manning, Prabhakar Raghavan and Hinrich Schütze offer the following definition:`}</p>
    <blockquote>
      <p>
A token is an instance of a sequence of characters in some particular document that are grouped together as a useful semantic unit for processing. 
      </p>
      <footer>
  From <cite><a href="https://nlp.stanford.edu/IR-book/html/htmledition/tokenization-1.html">2.2.1 of <i>Information Retrieval</i></a></cite>
      </footer>
    </blockquote>
    <p>{`Notice that definition doesn't use `}<strong parentName="p">{`word`}</strong>{`.  It aligns with the notion of `}<a parentName="p" {...{
        "href": "https://en.wikipedia.org/wiki/Lexical_item",
        "target": "_self",
        "rel": "nofollow"
      }}><em parentName="a">{`lexical item`}</em></a>{`.  By this definition, a token could be a `}<em parentName="p">{`single`}</em>{` word, a `}<em parentName="p">{`sequence`}</em>{` of words, or just a `}<em parentName="p">{`part`}</em>{` (ex. morpheme) of a word.`}</p>
    <p>{`Let's look at an example:`}</p>
    <p><em parentName="p">{`I like Valeria's shoes.`}</em>{` `}<span parentName="p" {...{
        "className": "math math-inline"
      }}><span parentName="span" {...{
          "className": "katex"
        }}><span parentName="span" {...{
            "className": "katex-mathml"
          }}><math parentName="span" {...{
              "xmlns": "http://www.w3.org/1998/Math/MathML"
            }}><semantics parentName="math"><mrow parentName="semantics"><mo parentName="mrow">{`→`}</mo></mrow><annotation parentName="semantics" {...{
                  "encoding": "application/x-tex"
                }}>{`\\rightarrow`}</annotation></semantics></math></span><span parentName="span" {...{
            "className": "katex-html",
            "aria-hidden": "true"
          }}><span parentName="span" {...{
              "className": "base"
            }}><span parentName="span" {...{
                "className": "strut",
                "style": {
                  "height": "0.3669em"
                }
              }}></span><span parentName="span" {...{
                "className": "mrel"
              }}>{`→`}</span></span></span></span></span>{` `}<code parentName="p" {...{
        "className": "language-text"
      }}>{`["I", "like", "Valeria", "'s", "shoes", "."]`}</code></p>
    <p>{`Given our definition of `}<strong parentName="p">{`token`}</strong>{`, we can `}<em parentName="p">{`tokenize`}</em>{` or split the sentence `}<code parentName="p" {...{
        "className": "language-text"
      }}>{`I like Valeria's shoes.`}</code>{` in a number of different ways.  Here are two possibilities:`}</p>
    <div {...{
      "className": "gatsby-highlight",
      "data-language": "text"
    }}><pre parentName="div" {...{
        "className": "language-text"
      }}><code parentName="pre" {...{
          "className": "language-text"
        }}>{`["I", "like", "Valeria", "'s", "shoes", "."]
["I", "like", "Valeria's", "shoes", "."]`}</code></pre></div>
    <p>{`You might be wondering which representation is "correct" or preferable.  As we'll see in later lessons, different problems require different solutions. `}</p>
    <p>{`There are, however, some general considerations:`}</p>
    <ul>
      <li parentName="ul"><strong parentName="li">{`compatibility with existing tools`}</strong>{`: statistical models for part of speech tagging, parsing, etc. are trained to expect a certain tokenization strategy.  If you're using an off-the-shelf tool (ex. POS tagger) to process data you've tokenized with a strategy unfamiliar to the model`}<sup parentName="li" {...{
          "id": "fnref-1"
        }}><a parentName="sup" {...{
            "href": "#fn-1",
            "className": "footnote-ref"
          }}>{`1`}</a></sup>{`, performance is likely to be poor.`}</li>
    </ul>
    <ul>
      <li parentName="ul">
        <p parentName="li"><strong parentName="p">{`extending or supplementing existing datasets`}</strong>{`: if you want to supplement some existing dataset, you probably want your tokenization strategy to match.`}</p>
      </li>
      <li parentName="ul">
        <p parentName="li"><strong parentName="p">{`size`}</strong>{`: different tokenization strategies can increase or decrease the number of unique tokens encountered. When we learn about classification tasks, this will become important.`}</p>
      </li>
    </ul>
    <p>{`In a very large corpus, which of the two tokenization strategies presented above will results in `}<em parentName="p">{`fewer`}</em>{` unique tokens (i.e., a more compact set of tokens)?`}</p>
    <h1 {...{
      "id": "types-vs-tokens",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h1" {...{
        "href": "#types-vs-tokens",
        "aria-label": "types vs tokens permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Types vs tokens`}</h1>
    <p>{`If tokens are roughly lexical items, then we can think of a token as an individual occurrence of a lexical item.  For example, the word "the" may occur many times in a document.`}<sup parentName="p" {...{
        "id": "fnref-2"
      }}><a parentName="sup" {...{
          "href": "#fn-2",
          "className": "footnote-ref"
        }}>{`2`}</a></sup>{`  `}</p>
    <p>{`Each of those occurrences is a `}<em parentName="p">{`token`}</em>{`.`}</p>
    <p>{`We use `}<strong parentName="p">{`type`}</strong>{` to reference the category represented by all of the occurrences of a single lexical item (ex. occurrences of "the" treated as a category).`}</p>
    <ul>
      <li parentName="ul">{`Identical tokens constitute a single `}<strong parentName="li">{`type`}</strong>{`. `}</li>
      <li parentName="ul">{`The set of types or vocabulary of some text is its unique set of tokens. `}</li>
      <li parentName="ul">{`The `}<strong parentName="li">{`type`}</strong>{` vs `}<strong parentName="li">{`token`}</strong>{` distinction is that of unique vs non-unique strings.`}</li>
    </ul>
    <p>{`Let's look at an example:`}</p>
    <p>{`"his friends are also his enemies" `}</p>
    <div {...{
      "className": "gatsby-highlight",
      "data-language": "python"
    }}><pre parentName="div" {...{
        "className": "language-python"
      }}><code parentName="pre" {...{
          "className": "language-python"
        }}>{`tokens `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{` `}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`[`}</span><span parentName="code" {...{
            "className": "token string"
          }}>{`"his"`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{` `}<span parentName="code" {...{
            "className": "token string"
          }}>{`"friends"`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{` `}<span parentName="code" {...{
            "className": "token string"
          }}>{`"are"`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{` `}<span parentName="code" {...{
            "className": "token string"
          }}>{`"also"`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{` `}<span parentName="code" {...{
            "className": "token string"
          }}>{`"his"`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{` `}<span parentName="code" {...{
            "className": "token string"
          }}>{`"enemies"`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`]`}</span>{`
types  `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{` `}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`{`}</span><span parentName="code" {...{
            "className": "token string"
          }}>{`"his"`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{` `}<span parentName="code" {...{
            "className": "token string"
          }}>{`"friends"`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{` `}<span parentName="code" {...{
            "className": "token string"
          }}>{`"are"`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{` `}<span parentName="code" {...{
            "className": "token string"
          }}>{`"also"`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{` `}<span parentName="code" {...{
            "className": "token string"
          }}>{`"enemies"`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`}`}</span></code></pre></div>
    <h1 {...{
      "id": "tokens-in-english",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h1" {...{
        "href": "#tokens-in-english",
        "aria-label": "tokens in english permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Tokens in English`}</h1>
    <p>{`If you examine tokenized English datasets, `}<a parentName="p" {...{
        "href": "https://universaldependencies.org/treebanks/en-comparison.html",
        "target": "_self",
        "rel": "nofollow"
      }}>{`such as those made available by the Universal Dependencies`}</a>{`, you'll notice that ...`}</p>
    <ul>
      <li parentName="ul">{`tokens don't contain whitespace`}</li>
      <li parentName="ul">{`possessive markers are split into separate tokens`}
        <ul parentName="li">
          <li parentName="ul">{`ex. `}<code parentName="li" {...{
              "className": "language-text"
            }}>{`"Iggy's"`}</code>{` `}<span parentName="li" {...{
              "className": "math math-inline"
            }}><span parentName="span" {...{
                "className": "katex"
              }}><span parentName="span" {...{
                  "className": "katex-mathml"
                }}><math parentName="span" {...{
                    "xmlns": "http://www.w3.org/1998/Math/MathML"
                  }}><semantics parentName="math"><mrow parentName="semantics"><mo parentName="mrow">{`→`}</mo></mrow><annotation parentName="semantics" {...{
                        "encoding": "application/x-tex"
                      }}>{`\\rightarrow`}</annotation></semantics></math></span><span parentName="span" {...{
                  "className": "katex-html",
                  "aria-hidden": "true"
                }}><span parentName="span" {...{
                    "className": "base"
                  }}><span parentName="span" {...{
                      "className": "strut",
                      "style": {
                        "height": "0.3669em"
                      }
                    }}></span><span parentName="span" {...{
                      "className": "mrel"
                    }}>{`→`}</span></span></span></span></span>{` `}<code parentName="li" {...{
              "className": "language-text"
            }}>{`["Iggy", "'s"]`}</code></li>
        </ul>
      </li>
      <li parentName="ul">{`clitic negations are split into separate tokens`}
        <ul parentName="li">
          <li parentName="ul">{`ex. `}<code parentName="li" {...{
              "className": "language-text"
            }}>{`"shouldn't"`}</code>{` `}<span parentName="li" {...{
              "className": "math math-inline"
            }}><span parentName="span" {...{
                "className": "katex"
              }}><span parentName="span" {...{
                  "className": "katex-mathml"
                }}><math parentName="span" {...{
                    "xmlns": "http://www.w3.org/1998/Math/MathML"
                  }}><semantics parentName="math"><mrow parentName="semantics"><mo parentName="mrow">{`→`}</mo></mrow><annotation parentName="semantics" {...{
                        "encoding": "application/x-tex"
                      }}>{`\\rightarrow`}</annotation></semantics></math></span><span parentName="span" {...{
                  "className": "katex-html",
                  "aria-hidden": "true"
                }}><span parentName="span" {...{
                    "className": "base"
                  }}><span parentName="span" {...{
                      "className": "strut",
                      "style": {
                        "height": "0.3669em"
                      }
                    }}></span><span parentName="span" {...{
                      "className": "mrel"
                    }}>{`→`}</span></span></span></span></span>{` `}<code parentName="li" {...{
              "className": "language-text"
            }}>{`["should", "n't"]`}</code></li>
        </ul>
      </li>
      <li parentName="ul">{`acronyms are `}<strong parentName="li">{`not`}</strong>{` split`}
        <ul parentName="li">
          <li parentName="ul">{`ex. `}<code parentName="li" {...{
              "className": "language-text"
            }}>{`"D.C."`}</code>{` -> `}<code parentName="li" {...{
              "className": "language-text"
            }}>{`"D.C."`}</code></li>
        </ul>
      </li>
    </ul>
    <p>{`For further guidance, see `}<a parentName="p" {...{
        "href": "https://universaldependencies.org/en/index.html",
        "target": "_self",
        "rel": "nofollow"
      }}>{`https://universaldependencies.org/en/index.html`}</a>{`.`}</p>
    <h1 {...{
      "id": "next-steps",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h1" {...{
        "href": "#next-steps",
        "aria-label": "next steps permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Next steps`}</h1>
    <h2 {...{
      "id": "practice",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h2" {...{
        "href": "#practice",
        "aria-label": "practice permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Practice`}</h2>
    <ul>
      <li parentName="ul">{`Give an example of a `}<strong parentName="li">{`token`}</strong>{` that is not a `}<strong parentName="li">{`word`}</strong></li>
      <li parentName="ul">{`Consider a non-English language that you've studied.  Alternatively, search for a sample of writing in non-English language.  `}
        <ul parentName="li">
          <li parentName="ul">{`Does the language use whitespace to delimit word boundaries?  Are there other features that appear useful for delimiting words?`}</li>
        </ul>
      </li>
      <li parentName="ul">{`Give one reason why splitting `}<code parentName="li" {...{
          "className": "language-text"
        }}>{`"couldn't"`}</code>{` as `}<code parentName="li" {...{
          "className": "language-text"
        }}>{`["could", "n't"]`}</code>{` is preferable to `}<code parentName="li" {...{
          "className": "language-text"
        }}>{`["couldn", "'t"]`}</code></li>
    </ul>

    <div {...{
      "className": "footnotes"
    }}>
      <hr parentName="div"></hr>
      <ol parentName="div">
        <li parentName="ol" {...{
          "id": "fn-1"
        }}>{`"unfamiliar" here means data that does not match or resemble the model's training data (ex. the two datasets were processed using different tokenization strategies).`}<a parentName="li" {...{
            "href": "#fnref-1",
            "className": "footnote-backref"
          }}>{`↩`}</a></li>
        <li parentName="ol" {...{
          "id": "fn-2"
        }}>{`To understand why, `}<a parentName="li" {...{
            "href": "/tutorials/parts-of-speech"
          }}>{`review the parts of speech lesson`}</a>{`.  `}<a parentName="li" {...{
            "href": "#fnref-2",
            "className": "footnote-backref"
          }}>{`↩`}</a></li>
      </ol>
    </div>
    </MDXLayout>;
}
;
MDXContent.isMDXComponent = true;
      