import * as React from 'react'
  /* @jsx mdx */
import { mdx } from '@mdx-js/react';
/* @jsxRuntime classic */
/* @jsx mdx */
import DefaultLayout from "/home/runner/work/myedibleenso.github.io/myedibleenso.github.io/src/components/BasicLayout.js";
// Frontmatter exported alongside the page component; this page declares none.
export const _frontmatter = {};

// Layout component that wraps the rendered page content.
const MDXLayout = DefaultLayout;

// Props spread onto the layout ahead of the caller-supplied props.
const layoutProps = { _frontmatter };
export default function MDXContent({
  components,
  ...props
}) {
  return <MDXLayout {...layoutProps} {...props} components={components} mdxType="MDXLayout">


    <h1 {...{
      "id": "overview",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h1" {...{
        "href": "#overview",
        "aria-label": "overview permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Overview`}</h1>
    <p>{`This is a brief look at tokenization and part of speech assignment in Japanese.`}</p>
    <h1 {...{
      "id": "outcomes",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h1" {...{
        "href": "#outcomes",
        "aria-label": "outcomes permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Outcomes`}</h1>
    <ul>
      <li parentName="ul"><strong parentName="li">{`examine`}</strong>{` linguistic data and `}<strong parentName="li">{`identify`}</strong>{` differences in how tokens are defined  across languages (ex. Japanese vs. English)`}</li>
      <li parentName="ul"><strong parentName="li">{`examine`}</strong>{` linguistic data and `}<strong parentName="li">{`identify`}</strong>{` differences in how parts of speech are assigned across languages (ex. Japanese vs. English)`}</li>
    </ul>
    <h2 {...{
      "id": "prerequisites",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h2" {...{
        "href": "#prerequisites",
        "aria-label": "prerequisites permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Prerequisites`}</h2>
    <p>{`Before starting this tutorial, ensure that ...`}</p>
    <ul>
      <li parentName="ul">{`you are `}<a parentName="li" {...{
          "href": "/tutorials/tokens"
        }}>{`familiar with tokens and types`}</a></li>
      <li parentName="ul">{`you are `}<a parentName="li" {...{
          "href": "/tutorials/parts-of-speech"
        }}>{`familiar with parts of speech`}</a></li>
    </ul>
    <h1 {...{
      "id": "background",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h1" {...{
        "href": "#background",
        "aria-label": "background permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Background`}</h1>
    <p>{`This section provides a (very) brief linguistic overview of Japanese.`}</p>
    <h2 {...{
      "id": "language-classification",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h2" {...{
        "href": "#language-classification",
        "aria-label": "language classification permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Language classification`}</h2>
    <ul>
      <li parentName="ul"><strong parentName="li">{`Family`}</strong>{`: Japanese is a member of the `}<a parentName="li" {...{
          "href": "https://en.wikipedia.org/wiki/Japonic_languages",
          "target": "_self",
          "rel": "nofollow"
        }}>{`Japonic languages family`}</a></li>
      <li parentName="ul"><strong parentName="li">{`Morphological typology`}</strong>{`: `}<a parentName="li" {...{
          "href": "http://www.kanji.org/kanji/japanese/writing/wordform.htm",
          "target": "_self",
          "rel": "nofollow"
        }}>{`The Japanese language is `}<em parentName="a">{`agglutinative`}</em></a>{`. Increasingly complex words are formed by combining morphemes.  The elements of these new words largely retain their original forms and meanings.  `}</li>
    </ul>
    <h2 {...{
      "id": "word-order",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h2" {...{
        "href": "#word-order",
        "aria-label": "word order permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Word order`}</h2>
    <p>{`Canonical word order in Japanese is SOV (subject - object - verb):`}</p>
    <p>{`  私は寿司を食べます  `}</p>
    <table>
      <thead parentName="table">
        <tr parentName="thead">
          <th parentName="tr" {...{
            "align": "center"
          }}>{`S`}</th>
          <th parentName="tr" {...{
            "align": "center"
          }}>{`O`}</th>
          <th parentName="tr" {...{
            "align": "center"
          }}>{`V`}</th>
        </tr>
      </thead>
      <tbody parentName="table">
        <tr parentName="tbody">
          <td parentName="tr" {...{
            "align": "center"
          }}>{`私は`}</td>
          <td parentName="tr" {...{
            "align": "center"
          }}>{`寿司を`}</td>
          <td parentName="tr" {...{
            "align": "center"
          }}>{`食べます`}</td>
        </tr>
        <tr parentName="tbody">
          <td parentName="tr" {...{
            "align": "center"
          }}><a parentName="td" {...{
              "href": "https://en.wikipedia.org/wiki/Japanese_grammar#Grammatical_case",
              "target": "_self",
              "rel": "nofollow"
            }}>{`1SG-NOM`}</a></td>
          <td parentName="tr" {...{
            "align": "center"
          }}>{`sushi-`}<a parentName="td" {...{
              "href": "https://en.wikipedia.org/wiki/Japanese_grammar#Grammatical_case",
              "target": "_self",
              "rel": "nofollow"
            }}>{`ACC`}</a></td>
          <td parentName="tr" {...{
            "align": "center"
          }}>{`eat`}</td>
        </tr>
      </tbody>
    </table>
    <p><sup parentName="p" {...{
        "id": "fnref-1"
      }}><a parentName="sup" {...{
          "href": "#fn-1",
          "className": "footnote-ref"
        }}>{`1`}</a></sup></p>
    <p>{`  'I eat sushi'`}</p>
    <h2 {...{
      "id": "written-language",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h2" {...{
        "href": "#written-language",
        "aria-label": "written language permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Written language`}</h2>
    <ul>
      <li parentName="ul">{`No whitespace between words used in writing`}
        <ul parentName="li">
          <li parentName="ul">{`there is no clear word boundary, unlike in English (see the previous example)`}</li>
        </ul>
      </li>
    </ul>
    <ul>
      <li parentName="ul">{`Mixed writing system`}
        <ul parentName="li">
          <li parentName="ul"><a parentName="li" {...{
              "href": "https://en.wikipedia.org/wiki/Kanji",
              "target": "_self",
              "rel": "nofollow"
            }}>{`kanji`}</a>
            <ul parentName="li">
              <li parentName="ul">{`logographic system of approximately 50,000 characters`}</li>
              <li parentName="ul">{`Example: `}<strong parentName="li">{`食`}</strong>{` in `}<strong parentName="li">{`食`}</strong>{`べます ('eat')`}</li>
            </ul>
          </li>
          <li parentName="ul"><a parentName="li" {...{
              "href": "https://en.wikipedia.org/wiki/Hiragana",
              "target": "_self",
              "rel": "nofollow"
            }}>{`hiragana`}</a>
            <ul parentName="li">
              <li parentName="ul">{`syllabary for native words; all `}<em parentName="li">{`kanji`}</em>{` can also be written (ambiguously) in `}<em parentName="li">{`hiragana`}</em></li>
              <li parentName="ul">{`Example: `}<strong parentName="li">{`べ`}</strong>{` (IPA: /be/), `}<strong parentName="li">{`ま`}</strong>{` (/ma/), and `}<strong parentName="li">{`す`}</strong>{` (/su/) in 食`}<strong parentName="li">{`べます`}</strong>{` ('eat')`}</li>
            </ul>
          </li>
          <li parentName="ul"><a parentName="li" {...{
              "href": "https://en.wikipedia.org/wiki/Katakana",
              "target": "_self",
              "rel": "nofollow"
            }}>{`katakana`}</a>
            <ul parentName="li">
              <li parentName="ul">{`syllabary primarily used for foreign terms, loanwords, onomatopoeic expressions, and emphasis`}</li>
              <li parentName="ul">{`Example: `}<strong parentName="li">{`ア`}</strong>{` (/a/), `}<strong parentName="li">{`イ`}</strong>{` (/i/), and `}<strong parentName="li">{`ス`}</strong>{` (/su/) in `}<strong parentName="li">{`アイス`}</strong>{` ('ice cream' /aisu/)`}</li>
            </ul>
          </li>
        </ul>
      </li>
    </ul>
    <h1 {...{
      "id": "how-is-it-annotated",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h1" {...{
        "href": "#how-is-it-annotated",
        "aria-label": "how is it annotated permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`How is it annotated?`}</h1>
    <p><a parentName="p" {...{
        "href": "https://universaldependencies.org/treebanks/ja-comparison.html",
        "target": "_self",
        "rel": "nofollow"
      }}>{`There are a number of annotated corpora available for Japanese that include part of speech information`}</a>{`.  Let's examine how part of speech tags are assigned according to the universal dependencies annotation guidelines for the Japanese language...`}</p>
    <h2 {...{
      "id": "universal-pos-upos-categorization",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h2" {...{
        "href": "#universal-pos-upos-categorization",
        "aria-label": "universal pos upos categorization permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Universal POS (UPOS) categorization`}</h2>
    <p>{`The Universal Dependency (UD) annotation guidelines for assigning part of speech tags for Japanese are described in the following paper:`}</p>
    <ul>
      <li parentName="ul"><a parentName="li" {...{
          "href": "https://www.aclweb.org/anthology/L16-1261.pdf#page=2",
          "target": "_self",
          "rel": "nofollow"
        }}>{`Overview of Universal Dependencies for Japanese`}</a></li>
    </ul>
    <table>
      <thead parentName="table">
        <tr parentName="thead">
          <th parentName="tr" {...{
            "align": null
          }}>{`POS Tag`}</th>
          <th parentName="tr" {...{
            "align": null
          }}>{`Explanations`}</th>
        </tr>
      </thead>
      <tbody parentName="table">
        <tr parentName="tbody">
          <td parentName="tr" {...{
            "align": null
          }}><a parentName="td" {...{
              "href": "https://universaldependencies.org/ja/pos/ADJ.html",
              "target": "_self",
              "rel": "nofollow"
            }}>{`ADJ`}</a></td>
          <td parentName="tr" {...{
            "align": null
          }}>{`Adjective`}</td>
        </tr>
        <tr parentName="tbody">
          <td parentName="tr" {...{
            "align": null
          }}><a parentName="td" {...{
              "href": "https://universaldependencies.org/ja/pos/ADV.html",
              "target": "_self",
              "rel": "nofollow"
            }}>{`ADV`}</a></td>
          <td parentName="tr" {...{
            "align": null
          }}>{`Adverb`}</td>
        </tr>
        <tr parentName="tbody">
          <td parentName="tr" {...{
            "align": null
          }}><a parentName="td" {...{
              "href": "https://universaldependencies.org/ja/pos/INTJ.html",
              "target": "_self",
              "rel": "nofollow"
            }}>{`INTJ`}</a></td>
          <td parentName="tr" {...{
            "align": null
          }}>{`Interjection`}</td>
        </tr>
        <tr parentName="tbody">
          <td parentName="tr" {...{
            "align": null
          }}><a parentName="td" {...{
              "href": "https://universaldependencies.org/ja/pos/NOUN.html",
              "target": "_self",
              "rel": "nofollow"
            }}>{`NOUN`}</a></td>
          <td parentName="tr" {...{
            "align": null
          }}>{`Noun (except for nouns that can be used as VERB or ADJ)`}</td>
        </tr>
        <tr parentName="tbody">
          <td parentName="tr" {...{
            "align": null
          }}><a parentName="td" {...{
              "href": "https://universaldependencies.org/ja/pos/PROPN.html",
              "target": "_self",
              "rel": "nofollow"
            }}>{`PROPN`}</a></td>
          <td parentName="tr" {...{
            "align": null
          }}>{`Proper noun`}</td>
        </tr>
        <tr parentName="tbody">
          <td parentName="tr" {...{
            "align": null
          }}><a parentName="td" {...{
              "href": "https://universaldependencies.org/ja/pos/VERB.html",
              "target": "_self",
              "rel": "nofollow"
            }}>{`VERB`}</a></td>
          <td parentName="tr" {...{
            "align": null
          }}>{`Verb (ex. 食べ "eat", 食事 "meal")`}</td>
        </tr>
        <tr parentName="tbody">
          <td parentName="tr" {...{
            "align": null
          }}><a parentName="td" {...{
              "href": "https://universaldependencies.org/ja/pos/ADP.html",
              "target": "_self",
              "rel": "nofollow"
            }}>{`ADP`}</a></td>
          <td parentName="tr" {...{
            "align": null
          }}>{`Case particles (が `}<em parentName="td">{`ga`}</em>{`),　adverbial particles (しか shika "only"), binding particles　（は `}<em parentName="td">{`wa`}</em>{`）`}</td>
        </tr>
        <tr parentName="tbody">
          <td parentName="tr" {...{
            "align": null
          }}><a parentName="td" {...{
              "href": "https://universaldependencies.org/ja/pos/AUX.html",
              "target": "_self",
              "rel": "nofollow"
            }}>{`AUX`}</a></td>
          <td parentName="tr" {...{
            "align": null
          }}>{`Auxiliary verb (食べ'た' "ate")`}</td>
        </tr>
        <tr parentName="tbody">
          <td parentName="tr" {...{
            "align": null
          }}><a parentName="td" {...{
              "href": "https://universaldependencies.org/ja/pos/CCONJ.html",
              "target": "_self",
              "rel": "nofollow"
            }}>{`CCONJ`}</a></td>
          <td parentName="tr" {...{
            "align": null
          }}>{`Conjunctions (また"also")`}</td>
        </tr>
        <tr parentName="tbody">
          <td parentName="tr" {...{
            "align": null
          }}><a parentName="td" {...{
              "href": "https://universaldependencies.org/ja/pos/DET.html",
              "target": "_self",
              "rel": "nofollow"
            }}>{`DET`}</a></td>
          <td parentName="tr" {...{
            "align": null
          }}>{`Adnominal (あの"that")`}</td>
        </tr>
        <tr parentName="tbody">
          <td parentName="tr" {...{
            "align": null
          }}><a parentName="td" {...{
              "href": "https://universaldependencies.org/ja/pos/NUM.html",
              "target": "_self",
              "rel": "nofollow"
            }}>{`NUM`}</a></td>
          <td parentName="tr" {...{
            "align": null
          }}>{`Number`}</td>
        </tr>
        <tr parentName="tbody">
          <td parentName="tr" {...{
            "align": null
          }}><a parentName="td" {...{
              "href": "https://universaldependencies.org/ja/pos/PART.html",
              "target": "_self",
              "rel": "nofollow"
            }}>{`PART`}</a></td>
          <td parentName="tr" {...{
            "align": null
          }}>{`Sentence-ending particles (か "?"), Suffix (本格'的' "genuine")`}</td>
        </tr>
        <tr parentName="tbody">
          <td parentName="tr" {...{
            "align": null
          }}><a parentName="td" {...{
              "href": "https://universaldependencies.org/ja/pos/PRON.html",
              "target": "_self",
              "rel": "nofollow"
            }}>{`PRON`}</a></td>
          <td parentName="tr" {...{
            "align": null
          }}>{`Pronoun`}</td>
        </tr>
        <tr parentName="tbody">
          <td parentName="tr" {...{
            "align": null
          }}><a parentName="td" {...{
              "href": "https://universaldependencies.org/ja/pos/SCONJ.html",
              "target": "_self",
              "rel": "nofollow"
            }}>{`SCONJ`}</a></td>
          <td parentName="tr" {...{
            "align": null
          }}>{`Conjunction particles (行っ'て' "go, and `}{`[...]`}{`"),  particle that attaches to a phrase and acts on the whole phrase (行く'の' "going")`}</td>
        </tr>
        <tr parentName="tbody">
          <td parentName="tr" {...{
            "align": null
          }}><a parentName="td" {...{
              "href": "https://universaldependencies.org/ja/pos/PUNCT.html",
              "target": "_self",
              "rel": "nofollow"
            }}>{`PUNCT`}</a></td>
          <td parentName="tr" {...{
            "align": null
          }}>{`Punctuation`}</td>
        </tr>
        <tr parentName="tbody">
          <td parentName="tr" {...{
            "align": null
          }}><a parentName="td" {...{
              "href": "https://universaldependencies.org/ja/pos/SYM.html",
              "target": "_self",
              "rel": "nofollow"
            }}>{`SYM`}</a></td>
          <td parentName="tr" {...{
            "align": null
          }}>{`Any punctuation except for PUNCT and X`}</td>
        </tr>
        <tr parentName="tbody">
          <td parentName="tr" {...{
            "align": null
          }}><a parentName="td" {...{
              "href": "https://universaldependencies.org/ja/pos/X.html",
              "target": "_self",
              "rel": "nofollow"
            }}>{`X`}</a></td>
          <td parentName="tr" {...{
            "align": null
          }}>{`Blank`}</td>
        </tr>
      </tbody>
    </table>
    <h2 {...{
      "id": "example-sentence",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h2" {...{
        "href": "#example-sentence",
        "aria-label": "example sentence permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Example Sentence`}</h2>
    <p>{`蔵前国技館跡地は東京都に売却し、その売却益が...`}</p>
    <table>
      <thead parentName="table">
        <tr parentName="thead">
          <th parentName="tr" {...{
            "align": null
          }}>{`Original`}</th>
          <th parentName="tr" {...{
            "align": null
          }}>{`蔵前`}</th>
          <th parentName="tr" {...{
            "align": null
          }}>{`国技`}</th>
          <th parentName="tr" {...{
            "align": null
          }}>{`館`}</th>
          <th parentName="tr" {...{
            "align": null
          }}>{`跡地`}</th>
          <th parentName="tr" {...{
            "align": null
          }}>{`は`}</th>
          <th parentName="tr" {...{
            "align": null
          }}>{`東京`}</th>
          <th parentName="tr" {...{
            "align": null
          }}>{`都`}</th>
          <th parentName="tr" {...{
            "align": null
          }}>{`に`}</th>
          <th parentName="tr" {...{
            "align": null
          }}>{`売却`}</th>
          <th parentName="tr" {...{
            "align": null
          }}>{`し`}</th>
          <th parentName="tr" {...{
            "align": null
          }}>{`、`}</th>
          <th parentName="tr" {...{
            "align": null
          }}>{`その`}</th>
          <th parentName="tr" {...{
            "align": null
          }}>{`売却`}</th>
          <th parentName="tr" {...{
            "align": null
          }}>{`益`}</th>
          <th parentName="tr" {...{
            "align": null
          }}>{`が`}</th>
        </tr>
      </thead>
      <tbody parentName="table">
        <tr parentName="tbody">
          <td parentName="tr" {...{
            "align": null
          }}>{`Romanization`}</td>
          <td parentName="tr" {...{
            "align": null
          }}>{`Kuramae`}</td>
          <td parentName="tr" {...{
            "align": null
          }}>{`kokugi`}</td>
          <td parentName="tr" {...{
            "align": null
          }}>{`kan`}</td>
          <td parentName="tr" {...{
            "align": null
          }}>{`atochi`}</td>
          <td parentName="tr" {...{
            "align": null
          }}>{`wa`}</td>
          <td parentName="tr" {...{
            "align": null
          }}>{`Tokyo`}</td>
          <td parentName="tr" {...{
            "align": null
          }}>{`to`}</td>
          <td parentName="tr" {...{
            "align": null
          }}>{`ni`}</td>
          <td parentName="tr" {...{
            "align": null
          }}>{`baikyaku`}</td>
          <td parentName="tr" {...{
            "align": null
          }}>{`shi`}</td>
          <td parentName="tr" {...{
            "align": null
          }}></td>
          <td parentName="tr" {...{
            "align": null
          }}>{`sono`}</td>
          <td parentName="tr" {...{
            "align": null
          }}>{`baikyaku`}</td>
          <td parentName="tr" {...{
            "align": null
          }}>{`eki`}</td>
          <td parentName="tr" {...{
            "align": null
          }}>{`ga`}</td>
        </tr>
        <tr parentName="tbody">
          <td parentName="tr" {...{
            "align": null
          }}><a parentName="td" {...{
              "href": "https://en.wikipedia.org/wiki/List_of_glossing_abbreviations",
              "target": "_self",
              "rel": "nofollow"
            }}>{`Gloss`}</a></td>
          <td parentName="tr" {...{
            "align": null
          }}>{`Kuramae`}</td>
          <td parentName="tr" {...{
            "align": null
          }}>{`national sport`}</td>
          <td parentName="tr" {...{
            "align": null
          }}>{`building`}</td>
          <td parentName="tr" {...{
            "align": null
          }}>{`site`}</td>
          <td parentName="tr" {...{
            "align": null
          }}>{`(topic particle)`}</td>
          <td parentName="tr" {...{
            "align": null
          }}>{`Tokyo`}</td>
          <td parentName="tr" {...{
            "align": null
          }}>{`capital`}</td>
          <td parentName="tr" {...{
            "align": null
          }}>{`(direction particle)`}</td>
          <td parentName="tr" {...{
            "align": null
          }}>{`sell`}</td>
          <td parentName="tr" {...{
            "align": null
          }}></td>
          <td parentName="tr" {...{
            "align": null
          }}></td>
          <td parentName="tr" {...{
            "align": null
          }}>{`that`}</td>
          <td parentName="tr" {...{
            "align": null
          }}>{`selling`}</td>
          <td parentName="tr" {...{
            "align": null
          }}>{`benefit`}</td>
          <td parentName="tr" {...{
            "align": null
          }}>{`(subject particle)`}</td>
        </tr>
        <tr parentName="tbody">
          <td parentName="tr" {...{
            "align": null
          }}>{`Tag`}</td>
          <td parentName="tr" {...{
            "align": null
          }}>{`PROPN`}</td>
          <td parentName="tr" {...{
            "align": null
          }}>{`NOUN`}</td>
          <td parentName="tr" {...{
            "align": null
          }}>{`NOUN`}</td>
          <td parentName="tr" {...{
            "align": null
          }}>{`NOUN`}</td>
          <td parentName="tr" {...{
            "align": null
          }}>{`ADP`}</td>
          <td parentName="tr" {...{
            "align": null
          }}>{`PROPN`}</td>
          <td parentName="tr" {...{
            "align": null
          }}>{`NOUN`}</td>
          <td parentName="tr" {...{
            "align": null
          }}>{`ADP`}</td>
          <td parentName="tr" {...{
            "align": null
          }}>{`VERB`}</td>
          <td parentName="tr" {...{
            "align": null
          }}>{`AUX`}</td>
          <td parentName="tr" {...{
            "align": null
          }}>{`PUNCT`}</td>
          <td parentName="tr" {...{
            "align": null
          }}>{`DET`}</td>
          <td parentName="tr" {...{
            "align": null
          }}>{`NOUN`}</td>
          <td parentName="tr" {...{
            "align": null
          }}>{`NOUN`}</td>
          <td parentName="tr" {...{
            "align": null
          }}>{`ADP`}</td>
        </tr>
        <tr parentName="tbody">
          <td parentName="tr" {...{
            "align": null
          }}>{`Detailed tag`}</td>
          <td parentName="tr" {...{
            "align": null
          }}>{`Proper noun`}</td>
          <td parentName="tr" {...{
            "align": null
          }}>{`Common noun`}</td>
          <td parentName="tr" {...{
            "align": null
          }}>{`Suffix`}</td>
          <td parentName="tr" {...{
            "align": null
          }}>{`Common noun`}</td>
          <td parentName="tr" {...{
            "align": null
          }}>{`binding particle`}</td>
          <td parentName="tr" {...{
            "align": null
          }}>{`Proper noun`}</td>
          <td parentName="tr" {...{
            "align": null
          }}>{`Common noun`}</td>
          <td parentName="tr" {...{
            "align": null
          }}>{`case particle`}</td>
          <td parentName="tr" {...{
            "align": null
          }}>{`Verb ("-suru" can be attached)`}</td>
          <td parentName="tr" {...{
            "align": null
          }}>{`Verb (cannot stand alone)`}</td>
          <td parentName="tr" {...{
            "align": null
          }}>{`Punctuation - comma`}</td>
          <td parentName="tr" {...{
            "align": null
          }}>{`Adnominal`}</td>
          <td parentName="tr" {...{
            "align": null
          }}>{`Noun ("-suru" can be attached)`}</td>
          <td parentName="tr" {...{
            "align": null
          }}>{`Noun ("-suru" can be attached)`}</td>
          <td parentName="tr" {...{
            "align": null
          }}>{`Particle (case particle)`}</td>
        </tr>
      </tbody>
    </table>
    <p><a parentName="p" {...{
        "href": "https://universaldependencies.org/treebanks/ja_bccwj",
        "target": "_self",
        "rel": "nofollow"
      }}>{`sent_id = dev-s49`}</a></p>
    <h3 {...{
      "id": "comment",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h3" {...{
        "href": "#comment",
        "aria-label": "comment permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Comment`}</h3>
    <p>{`Note that the UD annotation scheme for Japanese treats the tensed element of a verb (past tense marker `}<em parentName="p">{`ta`}</em>{`) as a separate token from the verb root (ex. 食べた ('ate') `}{`→`}{` `}<code parentName="p" {...{
        "className": "language-text"
      }}>{`["食べ", "た"]`}</code>{`).`}</p>
    <p>{`Verbal nouns are nouns that function as verbs when followed by an auxiliary verb (e.g. する `}<em parentName="p">{`suru`}</em>{` meaning "do"). The stems of verbal nouns are tagged as verbs.  For example, 売却 `}<em parentName="p">{`baikyaku`}</em>{` ('sale') is tagged as VERB in the first instance in the previous example.  The second instance of 売却 (`}<em parentName="p">{`baikyaku`}</em>{`), however, is tagged as NOUN.  Adjectival verbs.`}</p>
    <h1 {...{
      "id": "text-normalization",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h1" {...{
        "href": "#text-normalization",
        "aria-label": "text normalization permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Text normalization`}</h1>
    <p>{`Japanese does not have an upper case / lower case distinction.  Emphasis is conveyed through other means (ex. use of katakana).  Example strategies for text normalization in Japanese include ...`}</p>
    <ul>
      <li parentName="ul">{`Spelling correction (standardization) and replacement of repetitions`}
        <ul parentName="li">
          <li parentName="ul">{`すげえええええええええ！！！ ('awesomeeeeeeeee') `}<span parentName="li" {...{
              "className": "math math-inline"
            }}><span parentName="span" {...{
                "className": "katex"
              }}><span parentName="span" {...{
                  "className": "katex-mathml"
                }}><math parentName="span" {...{
                    "xmlns": "http://www.w3.org/1998/Math/MathML"
                  }}><semantics parentName="math"><mrow parentName="semantics"><mo parentName="mrow">{`→`}</mo></mrow><annotation parentName="semantics" {...{
                        "encoding": "application/x-tex"
                      }}>{`\\rightarrow`}</annotation></semantics></math></span><span parentName="span" {...{
                  "className": "katex-html",
                  "aria-hidden": "true"
                }}><span parentName="span" {...{
                    "className": "base"
                  }}><span parentName="span" {...{
                      "className": "strut",
                      "style": {
                        "height": "0.3669em"
                      }
                    }}></span><span parentName="span" {...{
                      "className": "mrel"
                    }}>{`→`}</span></span></span></span></span>{`　すごい！ `}</li>
        </ul>
      </li>
      <li parentName="ul">{`Replacing rare kanji sequences with hiragana`}
        <ul parentName="li">
          <li parentName="ul">{`The hiragana syllabary is a more compact vocabulary of symbols than kanji`}</li>
        </ul>
      </li>
    </ul>
    <h1 {...{
      "id": "challenges",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h1" {...{
        "href": "#challenges",
        "aria-label": "challenges permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Challenges`}</h1>
    <h2 {...{
      "id": "tokenization-challenges",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h2" {...{
        "href": "#tokenization-challenges",
        "aria-label": "tokenization challenges permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Tokenization challenges`}</h2>
    <ul>
      <li parentName="ul">{`No whitespace between words`}</li>
      <li parentName="ul"><strong parentName="li">{`Mixed writing system`}</strong>{`: Use of katakana syllabary for emphasis or other stylistic reasons can make segmentation difficult`}</li>
      <li parentName="ul"><strong parentName="li">{`Agglutinative`}</strong>{`: long compounds and uncommon `}<em parentName="li">{`kanji`}</em>{` can pose a challenge to segmentation strategies`}</li>
    </ul>
    <h2 {...{
      "id": "pos-tagging-challenges",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h2" {...{
        "href": "#pos-tagging-challenges",
        "aria-label": "pos tagging challenges permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`POS tagging challenges`}</h2>
    <p>{`While certainly not unique to Japanese, ambiguity is a challenge for POS assignment.  Instances of the same string may be assigned different POS tags in different contexts.`}</p>
    <ul>
      <li parentName="ul">{`For example, the particle の `}<em parentName="li">{`no`}</em>{` can be a case marker, sentence-final particle, or a nominalizer`}</li>
    </ul>
    <h1 {...{
      "id": "misc",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h1" {...{
        "href": "#misc",
        "aria-label": "misc permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Misc`}</h1>
    <p><strong parentName="p">{`NOTE: `}<em parentName="strong">{`Though it highlights unique linguistic challenges, this section is not strictly related to the Unit and is therefore optional.`}</em></strong></p>
    <h1 {...{
      "id": "digitization-and-handwriting-recognition-challenges",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h1" {...{
        "href": "#digitization-and-handwriting-recognition-challenges",
        "aria-label": "digitization and handwriting recognition challenges permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Digitization and handwriting recognition challenges`}</h1>
    <p>{`The digitization of historic Japanese texts faces numerous challenges.`}</p>
    <p>{`In addition to having a mixed writing system, the orientation of written text in Japanese is highly variable.  `}</p>
    <p>{`Japanese text can appear...`}</p>
    <ul>
      <li parentName="ul">{`left-to-right (LTR) horizontally`}</li>
    </ul>
    <p><figure parentName="p" {...{
        "className": "gatsby-resp-image-figure",
        "style": {}
      }}>{`
    `}<span parentName="figure" {...{
          "className": "gatsby-resp-image-wrapper",
          "style": {
            "position": "relative",
            "display": "block",
            "marginLeft": "auto",
            "marginRight": "auto",
            "maxWidth": "590px"
          }
        }}>{`
      `}<a parentName="span" {...{
            "className": "gatsby-resp-image-link",
            "href": "/static/8cbf7abbfc3b6f657ffc6353c775709a/a6d36/jpn-left-to-right.png",
            "style": {
              "display": "block"
            },
            "target": "_blank",
            "rel": "noopener"
          }}>{`
    `}<span parentName="a" {...{
              "className": "gatsby-resp-image-background-image",
              "style": {
                "paddingBottom": "39.189189189189186%",
                "position": "relative",
                "bottom": "0",
                "left": "0",
                "backgroundImage": "url('data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABQAAAAICAYAAAD5nd/tAAAACXBIWXMAAAsTAAALEwEAmpwYAAABl0lEQVQoz22R2Y7TMABF8/+fxBMgpBGrBoYh0zZu1mZxnL1Jm8U+KEYjhIT9cKxr++ra1/n84YGHt+9ZhhE2g26voA3/G8YYtDGW/6y1wU5jcMJP3xEP34i//iT78ULz7pFO1fRDzzAMKFVR1zXjOKK1xmiN1n9NX423+4KZF5x+vLKhUU1D3XXc1plxmhiuN6bpZs1K1dC0HfO8MK+GZVmY55n7/c58n9HALBv6S4NzOByRZU2lSm7TRJpKslzSdBPD0FPXFVl5JYwuVo/znjhOiKKIIAjxzz6qqckOPt5TinMSAX4kKVXNi9/ydKzwxJksTQgSRSEVbaPwPA/XdTkLD9/3SdPUMo5jRBDgPR5okgonyzLqStknniNJGCWUpaRrW2Q1cZETeVGiVIksS4pCWrNCSoqioKoq2qEnes5In3OcPM9tdCEEURhYns8+QRBYYz+u8MMMITxbUJIk/HJdm/Z4OiE8j7KtCb+4iDcf/yQMw4ide4vbtrGuq/34nXpbrb4X8Lq3FzIvi9X2c3rb760s28pv2ohbjlIa7vAAAAAASUVORK5CYII=')",
                "backgroundSize": "cover",
                "display": "block"
              }
            }}></span>{`
  `}<img parentName="a" {...{
              "className": "gatsby-resp-image-image",
              "alt": "left to right horizontally",
              "title": "An example of horizontal LTR text from the Japanese Wikipedia page on the Japanese language",
              "src": "/static/8cbf7abbfc3b6f657ffc6353c775709a/fcda8/jpn-left-to-right.png",
              "srcSet": ["/static/8cbf7abbfc3b6f657ffc6353c775709a/12f09/jpn-left-to-right.png 148w", "/static/8cbf7abbfc3b6f657ffc6353c775709a/e4a3f/jpn-left-to-right.png 295w", "/static/8cbf7abbfc3b6f657ffc6353c775709a/fcda8/jpn-left-to-right.png 590w", "/static/8cbf7abbfc3b6f657ffc6353c775709a/a6d36/jpn-left-to-right.png 650w"],
              "sizes": "(max-width: 590px) 100vw, 590px",
              "style": {
                "width": "100%",
                "height": "100%",
                "margin": "0",
                "verticalAlign": "middle",
                "position": "absolute",
                "top": "0",
                "left": "0"
              },
              "loading": "lazy",
              "decoding": "async"
            }}></img>{`
  `}</a>{`
    `}</span>{`
    `}<figcaption parentName="figure" {...{
          "className": "gatsby-resp-image-figcaption"
        }}>{`An example of horizontal LTR text from the Japanese Wikipedia page on the Japanese language`}</figcaption>{`
  `}</figure><sup parentName="p" {...{
        "id": "fnref-2"
      }}><a parentName="sup" {...{
          "href": "#fn-2",
          "className": "footnote-ref"
        }}>{`2`}</a></sup></p>
    <ul>
      <li parentName="ul">{`right-to-left (RTL) and top to bottom (TTB) (ex. newspapers)`}</li>
    </ul>
    <p>{`and even ...`}</p>
    <ul>
      <li parentName="ul">{`right-to-left horizontally (uncommon and old style)
`}<figure parentName="li" {...{
          "className": "gatsby-resp-image-figure",
          "style": {}
        }}>{`
    `}<span parentName="figure" {...{
            "className": "gatsby-resp-image-wrapper",
            "style": {
              "position": "relative",
              "display": "block",
              "marginLeft": "auto",
              "marginRight": "auto",
              "maxWidth": "383px"
            }
          }}>{`
      `}<a parentName="span" {...{
              "className": "gatsby-resp-image-link",
              "href": "/static/4f1f284f690dfb37ff97cc2685261007/d0c94/jpn-vertical-right-left.png",
              "style": {
                "display": "block"
              },
              "target": "_blank",
              "rel": "noopener"
            }}>{`
    `}<span parentName="a" {...{
                "className": "gatsby-resp-image-background-image",
                "style": {
                  "paddingBottom": "109.45945945945948%",
                  "position": "relative",
                  "bottom": "0",
                  "left": "0",
                  "backgroundImage": "url('data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABQAAAAWCAYAAADAQbwGAAAACXBIWXMAAAsTAAALEwEAmpwYAAAETUlEQVQ4y21VayxkZxg+vwSVuCdN+IHwxy2iEkF1JUSz2ojYZbTVNCwixHVJLKH8IfyQinsESwjdRmJFdRmXGWNd4hLqVkPZMMNqmXEZBjPjab6XM1h9k+985/vOe57ved/nfc/htBoNmKn21rDRHIsPY53YkPyG3dleQLGM/fk/aGxPdUMq7sTpmgj721JIN/7G+vo6pFIp5HI5Yeh0OnDswmz7wxbiBCH43MoMP/4QhZ+iv0dU5DPk5+VgWNiP6O8E+Pbp1xiXiPFNyFN84eWF7Oxs8HZ9fU2DYxceNCevAEbGn6H59WsIBFF49SoXMrkck1NTePIkAF/6f4Vf37xBbOwLCAQCBAQEPAbkwVZXV/HyZSbGx8eRmpqKlJQUxMbGIi0tDRUVFSgsLMTz589QXFyM+vp61NbWIisr6wHYA8CrqyvaYKbVaml9fHyM09NTqNVqms/OzvTRfMqMv+fub/Bg940BLywsYHZmGpL3k5j86+TWT0fg998lwE9P4gfPRKVSYWlpCXOzMxgZHcfwn8e3it74MAKPGP4fIG8KhYLyursrh3zvHyzu3Byk1T1k9giQZ7S5uYmtrS2cn59TfSmVR1haXqFnB6dA+8QVdnd3af32bQ/Kysoelw0PqlQqsby8TKCMmUgkhur4EP9+3L4J/xL4pe8IQuEg1Opz8mXqy2Syu8K+DxgaGgo/Pz/Mzs7i4uISg8J3eDcuhWjpDDLFNbaVwNbOPt5LxFAolejr60N8fDwGBwf1gnL3Q/Xy8oKhoSHCwsJQUPAzREP9mJGxMLWQK6+x/lELtQ6QyXaQkJAAJycnGBgYoLy8nDA0Gg04zW0vt7W1wcTEBJaWlnB2doa1tTV6e3+/TTvz4UXQUT2Gh4fDw8OD/EtKSu4Y8nUnFAphZWVFoDY2NggODqbuUanO9GqenJxienoal5eXJFhMTAz5NzY23jHkAY+OjuDr6wt3d3ckJSWhurqaXt7b26PBAHZ2dtDa2oru7m709/dTJKampkhOTr5jyJ/OFA4KCkJRURH1a1xcHHJycjAxMYHh4WE0NTVBIpHAx8eHDvX09ISDgwOF7O3trReWE4vFJD9DZ46sDHJzc0mYiIgIyu3Y2Bjtt7S0wMXFhcDMzc1hZmZGgG5ublRmBCgSibC4uIjDw0P6ulRVVSE6OpqYsBdYiMzYvLa2hpCQEPpsGRsbw9bWFvb29nB1ddUXO8faam5ujhaMLVMsIyMD/v7+sLOz05dEc3MzDg4OkJiYiMDAQBLD0dERFhYWxJBpQIAjIyMYHR2lBZszMzPR1dWFlZUVSgWfbJbDi4sLyiOrV47jYGRkRDNb818mjnXF/Pw8bTAVWW92dnbSPZ8XZg0NDSQc+4+UlpYiLy+PfgGRkZHUMbxxzJHlbWhoCB0dHcjPz6ewKysrUVdXBxYB82GlkZ6ejpqaGrS3t1NJDQwM0OGshnt6egj4P7r1cypb8aGQAAAAAElFTkSuQmCC')",
                  "backgroundSize": "cover",
                  "display": "block"
                }
              }}></span>{`
  `}<img parentName="a" {...{
                "className": "gatsby-resp-image-image",
                "alt": "right to left horizontally",
                "title": "A drink advertisement containing both horizontal RTL and TTB-RTL text",
                "src": "/static/4f1f284f690dfb37ff97cc2685261007/d0c94/jpn-vertical-right-left.png",
                "srcSet": ["/static/4f1f284f690dfb37ff97cc2685261007/12f09/jpn-vertical-right-left.png 148w", "/static/4f1f284f690dfb37ff97cc2685261007/e4a3f/jpn-vertical-right-left.png 295w", "/static/4f1f284f690dfb37ff97cc2685261007/d0c94/jpn-vertical-right-left.png 383w"],
                "sizes": "(max-width: 383px) 100vw, 383px",
                "style": {
                  "width": "100%",
                  "height": "100%",
                  "margin": "0",
                  "verticalAlign": "middle",
                  "position": "absolute",
                  "top": "0",
                  "left": "0"
                },
                "loading": "lazy",
                "decoding": "async"
              }}></img>{`
  `}</a>{`
    `}</span>{`
    `}<figcaption parentName="figure" {...{
            "className": "gatsby-resp-image-figcaption"
          }}>{`A drink advertisement containing both horizontal RTL and TTB-RTL text`}</figcaption>{`
  `}</figure><sup parentName="li" {...{
          "id": "fnref-3"
        }}><a parentName="sup" {...{
            "href": "#fn-3",
            "className": "footnote-ref"
          }}>{`3`}</a></sup></li>
    </ul>
    <p>{`Older styles of cursive script bear little resemblance to modern handwritten Japanese:`}</p>
    <ul>
      <li parentName="ul"><em parentName="li">{`kuzushiji`}</em>{` (old cursive style)
`}<figure parentName="li" {...{
          "className": "gatsby-resp-image-figure",
          "style": {}
        }}>{`
    `}<span parentName="figure" {...{
            "className": "gatsby-resp-image-wrapper",
            "style": {
              "position": "relative",
              "display": "block",
              "marginLeft": "auto",
              "marginRight": "auto",
              "maxWidth": "483px"
            }
          }}>{`
      `}<a parentName="span" {...{
              "className": "gatsby-resp-image-link",
              "href": "/static/8e006bc7e38b5eca7cf30f51f07da335/77a9e/jpn-kuzushiji.png",
              "style": {
                "display": "block"
              },
              "target": "_blank",
              "rel": "noopener"
            }}>{`
    `}<span parentName="a" {...{
                "className": "gatsby-resp-image-background-image",
                "style": {
                  "paddingBottom": "66.89189189189189%",
                  "position": "relative",
                  "bottom": "0",
                  "left": "0",
                  "backgroundImage": "url('data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABQAAAANCAYAAACpUE5eAAAACXBIWXMAAAsTAAALEwEAmpwYAAADKklEQVQ4yx2TO3PiBhRG9T9SuMpM0qZLk0mRIjMpUqTZSSZ+rwkCzBsZkASIpwTogXiZN+Zhs9is7V2bTbEz+WcnY1W3O/eb754r/Pbrz6jJEOnwBdnLc+TwOYWYDyV4hKXFeLlt8zg22F1rrOwsk3qKXjmOmw/RK0Vo5YK0NZHTP37h4OAbhJEhMalfMWuk2LhZ7lyVTVfjtlVgaiv893HIft7gaVhi62a5aSQZViJ0837srI9WwU+7EEQL/8X33x4grLsaUytHQxaZNlIsLIXrapJNt8Cmo7JfWbxMazyNy9y1VUbVBJYiegkdVaStXeJqIfTkGT/+8B3CoBankvLxNKrysa+xbcs4Ob83n4car0uL10WT3bDMraswqEa9RG4+iJk+QU+eYOeC2GqAw99/QphbCmND4nFS5WWms3DegEH6pQhfFgZfNi2+btssbIVxQ6JfDDEoRzAz76nFDtGTR96CbilGLvgOYWpeUckE2PY07vt59jc6vWqCWTPN47jEp0WT16XJp5nObVtlVk/S1YLYig89eUwteYyt+DFlP7p0gbDpyqxclX+XDfZznQ+9HHpWZDfQeB4V2a9MPi8cnmcN7joqs2YKNxegJp3R1QK08iL1jI96VqSSOEfoFGMYGZEbR/HSLZwsthbGLUbZ9Qt8Xhi8rls8DMrMmhKjapRWLoApX+Dm/JiqD1P20VQCFOOnCHNTZqRf8TYXTsaDrVsqK1dhZkrs1yZfH/o8TQ3WjszUSNIvR+kWQ9iqHyN9hpW9oJQ48aDCblRmP6+xchTmVoZ69q0PkXVbZVCJsr0u87SwuB9UPZXmzStGtTid4iXlxCla5JBi7AgtekxFOkdY2jLbbonbtkZJEumU4vQqMabNjOfavJlhN21wP6x6lx7WEkz0GJZygSGdkQ38iRw6pC69pxg9RniT+EO3wsPQQEv8w9CQWDqqp9JYj7Fx0jyM6+wmBitHZlS99F6uWwxjqyJa9IhS4pxy4gwj7UO465RYOyo3lsJIT7PpyPS1IOPSJSs7zcSQ2FxXWLqaV8uNmWJYTTIsx7FkkXz4b6qxE4qRQw/8P4Mqnz2El1b6AAAAAElFTkSuQmCC')",
                  "backgroundSize": "cover",
                  "display": "block"
                }
              }}></span>{`
  `}<img parentName="a" {...{
                "className": "gatsby-resp-image-image",
                "alt": "kuzushiji",
                "title": "a cursive writing style used for over 1000 years",
                "src": "/static/8e006bc7e38b5eca7cf30f51f07da335/77a9e/jpn-kuzushiji.png",
                "srcSet": ["/static/8e006bc7e38b5eca7cf30f51f07da335/12f09/jpn-kuzushiji.png 148w", "/static/8e006bc7e38b5eca7cf30f51f07da335/e4a3f/jpn-kuzushiji.png 295w", "/static/8e006bc7e38b5eca7cf30f51f07da335/77a9e/jpn-kuzushiji.png 483w"],
                "sizes": "(max-width: 483px) 100vw, 483px",
                "style": {
                  "width": "100%",
                  "height": "100%",
                  "margin": "0",
                  "verticalAlign": "middle",
                  "position": "absolute",
                  "top": "0",
                  "left": "0"
                },
                "loading": "lazy",
                "decoding": "async"
              }}></img>{`
  `}</a>{`
    `}</span>{`
    `}<figcaption parentName="figure" {...{
            "className": "gatsby-resp-image-figcaption"
          }}>{`a cursive writing style used for over 1000 years`}</figcaption>{`
  `}</figure>
        <ul parentName="li">
          <li parentName="ul"><a parentName="li" {...{
              "href": "https://web.archive.org/web/20200703075715/https://www3.nhk.or.jp/news/html/20191202/k10012198561000.html",
              "target": "_self",
              "rel": "nofollow"
            }}>{`Article (Japanese) about computational approaches to character recognition for `}<em parentName="a">{`kuzushiji`}</em></a></li>
          <li parentName="ul"><a parentName="li" {...{
              "href": "https://www.kaggle.com/c/kuzushiji-recognition/overview/about-kuzushiji",
              "target": "_self",
              "rel": "nofollow"
            }}>{`Kaggle challenge for recognizing `}<em parentName="a">{`kuzushiji`}</em></a></li>
        </ul>
      </li>
    </ul>
    <p>{`There is a vast trove of cultural information locked away in print media.`}</p>
    <h1 {...{
      "id": "next-steps",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h1" {...{
        "href": "#next-steps",
        "aria-label": "next steps permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Next steps`}</h1>
    <h1 {...{
      "id": "practice",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h1" {...{
        "href": "#practice",
        "aria-label": "practice permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Practice`}</h1>
    <ul>
      <li parentName="ul">{`Using the provided linguistic data, identify some differences in how `}<strong parentName="li">{`tokens`}</strong>{` are defined/delimited in `}<a parentName="li" {...{
          "href": "https://universaldependencies.org/en/#tokenization-and-word-segmentation",
          "target": "_self",
          "rel": "nofollow"
        }}>{`English`}</a>{` and `}<a parentName="li" {...{
          "href": "https://universaldependencies.org/ja/#tokenization-and-word-segmentation",
          "target": "_self",
          "rel": "nofollow"
        }}>{`Japanese`}</a>
        <ul parentName="li">
          <li parentName="ul">{`What patterns do you notice (if any) for tokenizing Japanese text?`}</li>
        </ul>
      </li>
      <li parentName="ul">{`Using the provided linguistic data, identify some differences in how `}<strong parentName="li">{`parts of speech`}</strong>{` are assigned in `}<a parentName="li" {...{
          "href": "https://universaldependencies.org/en/pos",
          "target": "_self",
          "rel": "nofollow"
        }}>{`English`}</a>{` and `}<a parentName="li" {...{
          "href": "https://universaldependencies.org/ja/pos/",
          "target": "_self",
          "rel": "nofollow"
        }}>{`Japanese`}</a></li>
    </ul>
    <h2 {...{
      "id": "additional-resources",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h2" {...{
        "href": "#additional-resources",
        "aria-label": "additional resources permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Additional resources`}</h2>
    <ul>
      <li parentName="ul"><a parentName="li" {...{
          "href": "https://wals.info/languoid/lect/wals_code_jpn",
          "target": "_self",
          "rel": "nofollow"
        }}>{`WALS entry for Japanese`}</a></li>
      <li parentName="ul"><a parentName="li" {...{
          "href": "https://www.aclweb.org/anthology/L16-1261.pdf",
          "target": "_self",
          "rel": "nofollow"
        }}>{`Universal Dependencies for Japanese`}</a></li>
      <li parentName="ul"><a parentName="li" {...{
          "href": "https://www.jstage.jst.go.jp/article/jnlp/26/1/26_3/_pdf/-char/ja",
          "target": "_self",
          "rel": "nofollow"
        }}>{`Japanese Universal Dependencies Corpora (in Japanese)`}</a></li>
    </ul>
    <h2 {...{
      "id": "datasets",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h2" {...{
        "href": "#datasets",
        "aria-label": "datasets permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Datasets`}</h2>
    <ul>
      <li parentName="ul"><a parentName="li" {...{
          "href": "https://universaldependencies.org/treebanks/ja_bccwj/index.html",
          "target": "_self",
          "rel": "nofollow"
        }}>{`UD Japanese BCCWJ`}</a></li>
    </ul>

    <div {...{
      "className": "footnotes"
    }}>
      <hr parentName="div"></hr>
      <ol parentName="div">
        <li parentName="ol" {...{
          "id": "fn-1"
        }}>{`The bottom row of this table uses `}<a parentName="li" {...{
            "href": "https://www.eva.mpg.de/lingua/pdf/Glossing-Rules.pdf",
            "target": "_self",
            "rel": "nofollow"
          }}>{`https://www.eva.mpg.de/lingua/pdf/Glossing-Rules.pdf`}</a><a parentName="li" {...{
            "href": "#fnref-1",
            "className": "footnote-backref"
          }}>{`↩`}</a></li>
        <li parentName="ol" {...{
          "id": "fn-2"
        }}>{`See `}<a parentName="li" {...{
            "href": "https://ja.wikipedia.org/wiki/%E6%97%A5%E6%9C%AC%E8%AA%9E",
            "target": "_self",
            "rel": "nofollow"
          }}>{`https://ja.wikipedia.org/wiki/%E6%97%A5%E6%9C%AC%E8%AA%9E`}</a><a parentName="li" {...{
            "href": "#fnref-2",
            "className": "footnote-backref"
          }}>{`↩`}</a></li>
        <li parentName="ol" {...{
          "id": "fn-3"
        }}>{`From `}<a parentName="li" {...{
            "href": "https://ima.goo.ne.jp/column/article/3948.html",
            "target": "_self",
            "rel": "nofollow"
          }}>{`https://ima.goo.ne.jp/column/article/3948.html`}</a>{` (now defunct)`}<a parentName="li" {...{
            "href": "#fnref-3",
            "className": "footnote-backref"
          }}>{`↩`}</a></li>
      </ol>
    </div>
    </MDXLayout>;
}
;
// Attach a static `isMDXComponent` flag to the default-exported component.
// NOTE(review): presumably read by the MDX runtime/tooling to recognize
// compiled MDX content — the consumer is not visible in this file; confirm.
MDXContent.isMDXComponent = true;
      