import * as React from 'react'
  /* @jsx mdx */
import { mdx } from '@mdx-js/react';
/* @jsxRuntime classic */
/* @jsx mdx */
import DefaultLayout from "/home/runner/work/myedibleenso.github.io/myedibleenso.github.io/src/components/BasicLayout.js";
import { HTMLTable } from '@blueprintjs/core';
// Layout component that wraps everything this page renders.
const MDXLayout = DefaultLayout;

// Frontmatter exported alongside the page component; this page declares none.
export const _frontmatter = {};

// Props handed to the layout on every render (before caller-supplied props).
const layoutProps = { _frontmatter };
export default function MDXContent({
  components,
  ...props
}) {
  return <MDXLayout {...layoutProps} {...props} components={components} mdxType="MDXLayout">



    <h1 {...{
      "id": "overview",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h1" {...{
        "href": "#overview",
        "aria-label": "overview permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Overview`}</h1>
    <p>{`Welcome to `}<strong parentName="p">{`Intro to NLP: Representing words and documents as vectors`}</strong>{`.  This is an introductory-level NLP tutorial for `}<a parentName="p" {...{
        "href": "https://researchbazaar.arizona.edu/resbaz/resbazTucson2022/",
        "target": "_self",
        "rel": "nofollow"
      }}>{`Resbaz 2022`}</a>{`!`}</p>
    <p>{`In this short workshop (1-2 hours), we'll look at how to represent words and documents as vectors and compare them.  These representations can be used to cluster information or train statistical classifiers for various tasks.`}</p>
    <h2 {...{
      "id": "description",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h2" {...{
        "href": "#description",
        "aria-label": "description permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Description`}</h2>
    <p>{`Natural Language Processing (NLP) is an applied field of study at the intersection of linguistics, computer science, and machine learning that examines automated ways of making sense of natural language.  `}</p>
    <p>{`NLP tools are all around us. `}</p>
    <HTMLTable condensed striped mdxType="HTMLTable">
  <tbody>
    <tr>
      <td>
        <p>Chatbots</p>
      </td>
      <td>
      </td>
      <td>
        <p>
          <ul>
            <li><a href="https://huggingface.co/spaces/ThomasSimonini/Chat-with-Gandalf-GPT-J6B">Chat with Gandalf (GPT-based demo)</a></li>
            <li>...</li>
          </ul>
        </p>
      </td>
    </tr>
    <tr>
      <td>
        <p>Virtual assistants</p>
      </td>
      <td>
      </td>
      <td>
        <p>
          <ul>
            <li><a href="https://www.apple.com/siri/">Siri</a></li>
            <li><a href="https://developer.amazon.com/en-US/alexa">Alexa</a></li>
            <li><a href="https://assistant.google.com">Google Assistant</a></li>
            <li><a href="https://mycroft.ai/">Mycroft</a></li>
            <li>...</li>
          </ul>
        </p>
      </td>
    </tr>
    <tr>
      <td>
        <p>SPAM filters</p>
      </td>
      <td>
      </td>
      <td>
        <p>
          <ul>
            <li><a href="https://huggingface.co/spaces/iSky/spam-detector">Email (demo)</a></li>
          </ul>
        </p>
      </td>
    </tr>
    <tr>
      <td>
        <p>Detecting and moderating hate speech</p>
      </td>
      <td>
      </td>
      <td>
        <p>
          <ul>
            <li><a href="https://huggingface.co/spaces/ucberkeley-dlab/measuring-hate-speech">Demo</a></li>
          </ul>
        </p>
      </td>
    </tr>
    <tr>
      <td>
        <p>Machine Translation</p>
      </td>
      <td>
      </td>
      <td>
        <p>
          <ul>
            <li><a href="https://huggingface.co/spaces/engmatic-earth/Eng-JPkeigo-translator_v1">English to Japanese (with honorifics)</a></li>
          </ul>
        </p>
      </td>
    </tr>
    <tr>
      <td>
        <p>Summarization and simplification</p>
      </td>
      <td>
      </td>
      <td>
        <p>
          ELI5, create accessible resources for learners, etc.
        </p>
      </td>
    </tr>
    <tr>
      <td>
        <p>Voice cloning</p>
      </td>
      <td>
      </td>
      <td>
        <p>
          Read <i>Harry Potter and the Philosopher's Stone</i> in the voice of Arnold Schwarzenegger
          <ul>
            <li><a href="https://github.com/NVIDIA/tacotron2">Tacotron2</a></li>
            <li><a href="https://github.com/jik876/hifi-gan">HiFi GAN</a></li>
          </ul>
        </p>
      </td>
    </tr>
    <tr>
      <td>
        <p>Sentiment analysis</p>
      </td>
      <td>
      </td>
      <td>
        <p>
          <a href="https://huggingface.co/spaces/jmansfield89/Tweet_NLP_Sentiment_Analysis">Was that tweet a compliment or complaint about your product/company?</a>
        </p>
      </td>
    </tr>
    <tr>
      <td>
        <p>Search</p>
      </td>
      <td>
      </td>
      <td>
        <p>
          <a href="https://www.google.com/search/howsearchworks/">What happens when you search for "sneakers" on Google?</a>
        </p>
      </td>
    </tr>
  </tbody>
    </HTMLTable>
    <p>{`NLP is a very broad field involving text, audio (speech), images (handwriting, layout analysis, etc.), and video (ex. signed languages) data for all of the world's languages (extant and extinct).  There are many different ways to represent this data.  In this workshop, we'll introduce some foundational concepts for 1) representing text data by engineering features to create vector-based representations of words and documents, as well as 2) methods for comparing such representations.`}</p>
    <p>{`Aside from our everyday life, NLP has made its way into just about every industry, including medicine, finance, advertising, defense, and gaming.`}</p>
    <h2 {...{
      "id": "prerequisites",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h2" {...{
        "href": "#prerequisites",
        "aria-label": "prerequisites permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Prerequisites`}</h2>
    <p>{`This workshop is meant to be accessible to people without a background in programming or advanced math (everything you need to get started you already covered in high school or earlier).`}</p>
    <p>{`To get the most out of this workshop, you should be comfortable with the basics of programming (ideally in Python) and have a working `}<a parentName="p" {...{
        "href": "https://docs.docker.com/get-docker/",
        "target": "_self",
        "rel": "nofollow"
      }}>{`Docker installation`}</a>{`.  `}</p>
    <p>{`If you're already familiar with the basics of programming in Python, you'll be able to follow along with the provided examples `}<strong parentName="p">{`(all examples use Python 3.8)`}</strong>{`.  `}</p>
    <h2 {...{
      "id": "objectives",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h2" {...{
        "href": "#objectives",
        "aria-label": "objectives permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Objectives`}</h2>
    <p>{`By the end of this workshop, you will be able to ...`}</p>
    <ul>
      <li parentName="ul">{`represent words and documents as vectors`}</li>
      <li parentName="ul">{`generate character and word `}<span parentName="li" {...{
          "className": "math math-inline"
        }}><span parentName="span" {...{
            "className": "katex"
          }}><span parentName="span" {...{
              "className": "katex-mathml"
            }}><math parentName="span" {...{
                "xmlns": "http://www.w3.org/1998/Math/MathML"
              }}><semantics parentName="math"><mrow parentName="semantics"><mi parentName="mrow">{`n`}</mi></mrow><annotation parentName="semantics" {...{
                    "encoding": "application/x-tex"
                  }}>{`n`}</annotation></semantics></math></span><span parentName="span" {...{
              "className": "katex-html",
              "aria-hidden": "true"
            }}><span parentName="span" {...{
                "className": "base"
              }}><span parentName="span" {...{
                  "className": "strut",
                  "style": {
                    "height": "0.4306em"
                  }
                }}></span><span parentName="span" {...{
                  "className": "mord mathnormal"
                }}>{`n`}</span></span></span></span></span>{`-grams`}</li>
      <li parentName="ul">{`compare vectors to find similar items`}</li>
    </ul>
    <h2 {...{
      "id": "location-and-times",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h2" {...{
        "href": "#location-and-times",
        "aria-label": "location and times permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Location and Times`}</h2>
    <p>{`This workshop is completely virtual.  `}</p>
    <h2 {...{
      "id": "author",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h2" {...{
        "href": "#author",
        "aria-label": "author permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Author`}</h2>
    <p>{`Hi! My name is `}<a parentName="p" {...{
        "href": "/about"
      }}>{`Gus Hahn-Powell`}</a>{`. `}</p>
    <p>{`I'm a computational linguist interested in ways we can use natural language processing to accelerate scientific discovery by mining millions of scholarly documents.`}</p>
    <table>
      <thead parentName="table">
        <tr parentName="thead">
          <th parentName="tr" {...{
            "align": null
          }}>{`Name`}</th>
          <th parentName="tr" {...{
            "align": null
          }}><a parentName="th" {...{
              "href": "https://linguistics.arizona.edu/person/gus-hahn-powell",
              "target": "_self",
              "rel": "nofollow"
            }}>{`Gus Hahn-Powell`}</a></th>
        </tr>
      </thead>
      <tbody parentName="table">
        <tr parentName="tbody">
          <td parentName="tr" {...{
            "align": null
          }}>{`Email`}</td>
          <td parentName="tr" {...{
            "align": null
          }}><code parentName="td" {...{
              "className": "language-text"
            }}>{`hahnpowell AT arizona DOT edu`}</code></td>
        </tr>
        <tr parentName="tbody">
          <td parentName="tr" {...{
            "align": null
          }}>{`Appointments`}</td>
          <td parentName="tr" {...{
            "align": null
          }}><a parentName="td" {...{
              "href": "https://calendar.parsertongue.com",
              "target": "_self",
              "rel": "nofollow"
            }}>{`https://calendar.parsertongue.com`}</a></td>
        </tr>
      </tbody>
    </table>
    <h1 {...{
      "id": "tutorials",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h1" {...{
        "href": "#tutorials",
        "aria-label": "tutorials permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Tutorials`}</h1>
    <div {...{
      "className": "admonition admonition-note alert alert--secondary"
    }}><div parentName="div" {...{
        "className": "admonition-heading"
      }}><h5 parentName="div"><span parentName="h5" {...{
            "className": "admonition-icon"
          }}><svg parentName="span" {...{
              "xmlns": "http://www.w3.org/2000/svg",
              "width": "14",
              "height": "16",
              "viewBox": "0 0 14 16"
            }}><path parentName="svg" {...{
                "fillRule": "evenodd",
                "d": "M6.3 5.69a.942.942 0 0 1-.28-.7c0-.28.09-.52.28-.7.19-.18.42-.28.7-.28.28 0 .52.09.7.28.18.19.28.42.28.7 0 .28-.09.52-.28.7a1 1 0 0 1-.7.3c-.28 0-.52-.11-.7-.3zM8 7.99c-.02-.25-.11-.48-.31-.69-.2-.19-.42-.3-.69-.31H6c-.27.02-.48.13-.69.31-.2.2-.3.44-.31.69h1v3c.02.27.11.5.31.69.2.2.42.31.69.31h1c.27 0 .48-.11.69-.31.2-.19.3-.42.31-.69H8V7.98v.01zM7 2.3c-3.14 0-5.7 2.54-5.7 5.68 0 3.14 2.56 5.7 5.7 5.7s5.7-2.55 5.7-5.7c0-3.15-2.56-5.69-5.7-5.69v.01zM7 .98c3.86 0 7 3.14 7 7s-3.14 7-7 7-7-3.12-7-7 3.14-7 7-7z"
              }}></path></svg></span>{`note`}</h5></div><div parentName="div" {...{
        "className": "admonition-content"
      }}><p parentName="div">{`Complete these tutorials in the order listed:`}</p></div></div>
    <ol>
      <li parentName="ol">
        <p parentName="li"><a parentName="p" {...{
            "href": "/tutorials/feature-vectors"
          }}>{`Representing words & documents`}</a></p>
      </li>
      <li parentName="ol">
        <p parentName="li"><a parentName="p" {...{
            "href": "/tutorials/n-grams"
          }}><span parentName="a" {...{
              "className": "math math-inline"
            }}><span parentName="span" {...{
                "className": "katex"
              }}><span parentName="span" {...{
                  "className": "katex-mathml"
                }}><math parentName="span" {...{
                    "xmlns": "http://www.w3.org/1998/Math/MathML"
                  }}><semantics parentName="math"><mrow parentName="semantics"><mi parentName="mrow">{`n`}</mi></mrow><annotation parentName="semantics" {...{
                        "encoding": "application/x-tex"
                      }}>{`n`}</annotation></semantics></math></span><span parentName="span" {...{
                  "className": "katex-html",
                  "aria-hidden": "true"
                }}><span parentName="span" {...{
                    "className": "base"
                  }}><span parentName="span" {...{
                      "className": "strut",
                      "style": {
                        "height": "0.4306em"
                      }
                    }}></span><span parentName="span" {...{
                      "className": "mord mathnormal"
                    }}>{`n`}</span></span></span></span></span>{`-grams`}</a></p>
      </li>
      <li parentName="ol">
        <p parentName="li"><a parentName="p" {...{
            "href": "/tutorials/vector-basics"
          }}>{`Vector basics`}</a></p>
      </li>
      <li parentName="ol">
        <p parentName="li"><a parentName="p" {...{
            "href": "/tutorials/distance-and-similarity"
          }}>{`Comparing vectors`}</a></p>
      </li>
    </ol>
    <h3 {...{
      "id": "supplemental",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h3" {...{
        "href": "#supplemental",
        "aria-label": "supplemental permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Supplemental`}</h3>
    <p>{`Ready to speed up your comparisons? Start by familiarizing yourself with the `}<a parentName="p" {...{
        "href": "https://numpy.org/",
        "target": "_self",
        "rel": "nofollow"
      }}>{`NumPy`}</a>{` library for fast numerical computing:`}</p>
    <p><undefined parentName="p">{`
        `}<div {...{
          "className": "embedVideo-container"
        }}>{`
            `}<iframe parentName="div" {...{
            "title": "",
            "width": 800,
            "height": 400,
            "src": "https://www.youtube-nocookie.com/embed/kEZLrVW-9Eg?rel=0",
            "className": "embedVideo-iframe",
            "style": {
              "border": "0"
            },
            "loading": "eager",
            "allowFullScreen": true,
            "sandbox": "allow-same-origin allow-scripts allow-popups"
          }}></iframe>{`
        `}</div></undefined></p>
    <h1 {...{
      "id": "practice",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h1" {...{
        "href": "#practice",
        "aria-label": "practice permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Practice`}</h1>
    <p>{`Once you've completed the above tutorials, `}<a parentName="p" {...{
        "href": "https://arizona.openclass.ai/invite?code=-bo0EHnA1L4ABA",
        "target": "_self",
        "rel": "nofollow"
      }}>{`review and practice what we've covered`}</a>{`.`}</p>
    <h2 {...{
      "id": "word-embeddings",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h2" {...{
        "href": "#word-embeddings",
        "aria-label": "word embeddings permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Word embeddings`}</h2>
    <p>{`While this tutorial looked at ways of engineering features for word and document vectors, it's also possible to learn representations.`}<sup parentName="p" {...{
        "id": "fnref-1"
      }}><a parentName="sup" {...{
          "href": "#fn-1",
          "className": "footnote-ref"
        }}>{`1`}</a></sup></p>
    <p>{`Apply what you've learned in this workshop to `}<a parentName="p" {...{
        "href": "http://vectors.nlpl.eu/repository/20/3.zip",
        "target": "_self",
        "rel": "nofollow"
      }}>{`explore a set of pre-trained word embeddings`}</a>{`.`}</p>
    <ol>
      <li parentName="ol">
        <p parentName="li">{`Load the word embeddings into a dictionary that maps each word to a numpy array.`}</p>
      </li>
      <li parentName="ol">
        <p parentName="li">{`Using cosine similarity as your metric, what are the 10 most-similar words to "dog"?`}</p>
      </li>
      <li parentName="ol">
        <p parentName="li">{`Average the embeddings for "pizza" and "pineapple".  Using cosine similarity as your metric, what are the 10 most-similar words to this averaged embedding?`}</p>
      </li>
      <li parentName="ol">
        <p parentName="li">{`For each of the following sentences, sum the embeddings for each word in the sentence:  `}</p>
      </li>
    </ol>
    <ul>
      <li parentName="ul"><em parentName="li">{`That bodacious green teen with the nunchaku is a ninja turtle.`}</em></li>
      <li parentName="ul"><em parentName="li">{`I glimpsed a stealthy shinobi slip silently through the shadows.`}</em></li>
      <li parentName="ul"><em parentName="li">{`Turtle soup is not something you want to slurp in the company of friendly reptiles.`}</em></li>
      <li parentName="ul"><em parentName="li">{`Do you put pineapple on your pizza?`}</em></li>
      <li parentName="ul"><em parentName="li">{`Finish your fettuccine alfredo before taking a bite of your canoli.`}</em></li>
    </ul>
    <p>{`Using cosine similarity as your metric, what sentence is most similar to `}<em parentName="p">{`I like turtles`}</em>{`?`}</p>
    <p>{`Repeat your experiment by averaging the embeddings, rather than summing.  What do you notice?`}</p>
    <h1 {...{
      "id": "next-steps",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h1" {...{
        "href": "#next-steps",
        "aria-label": "next steps permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Next steps`}</h1>
    <p>{`Interested in learning more?  Are you a UA student?  Consider taking `}<a parentName="p" {...{
        "href": "/courses/hlt-1"
      }}>{`LING 529 (HLT I)`}</a>{`, `}<a parentName="p" {...{
        "href": "/courses/snlp-1"
      }}>{`LING 539 (Intro to Statistical NLP)`}</a>{`, and/or `}<a parentName="p" {...{
        "href": "/courses/snlp-2"
      }}>{`LING 582 (Advanced Statistical NLP)`}</a>{`.  `}</p>
    <p>{`All three are offered as 7.5-week asychronous online courses as part of our online `}<a parentName="p" {...{
        "href": "https://uazhlt.github.io/hlt-online/about",
        "target": "_self",
        "rel": "nofollow"
      }}>{`MS in Human Language Technology`}</a>{`.`}</p>
    <h1 {...{
      "id": "footnotes",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h1" {...{
        "href": "#footnotes",
        "aria-label": "footnotes permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Footnotes`}</h1>

    <div {...{
      "className": "footnotes"
    }}>
      <hr parentName="div"></hr>
      <ol parentName="div">
        <li parentName="ol" {...{
          "id": "fn-1"
        }}>{`If you're curious to understand `}<em parentName="li">{`how`}</em>{` and `}<em parentName="li">{`why`}</em>{` you might want to learn such representations, consider taking `}<a parentName="li" {...{
            "href": "/courses/snlp-1"
          }}>{`LING 539`}</a>{`.  `}<a parentName="li" {...{
            "href": "#fnref-1",
            "className": "footnote-backref"
          }}>{`↩`}</a></li>
      </ol>
    </div>
    </MDXLayout>;
}
;
MDXContent.isMDXComponent = true;
      