import * as React from 'react'
  /* @jsx mdx */
import { mdx } from '@mdx-js/react';
/* @jsxRuntime classic */

/* @jsx mdx */

import DefaultLayout from "/home/runner/work/myedibleenso.github.io/myedibleenso.github.io/src/components/BasicLayout.js";
// Frontmatter export for this page; it is an empty object in this file.
export const _frontmatter = {};
// Props bundle that MDXContent spreads onto the layout element ahead of the
// caller-supplied props.
const layoutProps = {
  _frontmatter
};
// The layout wrapper rendered around the page content: the default export of
// the BasicLayout.js component imported above.
const MDXLayout = DefaultLayout;
export default function MDXContent({
  components,
  ...props
}) {
  return <MDXLayout {...layoutProps} {...props} components={components} mdxType="MDXLayout">


    <h1 {...{
      "id": "overview",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h1" {...{
        "href": "#overview",
        "aria-label": "overview permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Overview`}</h1>
    <p>{`This tutorial provides some useful default settings and strategies when using deep learning for NLP.  `}</p>
    <h1 {...{
      "id": "resources",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h1" {...{
        "href": "#resources",
        "aria-label": "resources permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Resources`}</h1>
    <ul>
      <li parentName="ul">
        <p parentName="li">{`Yoav Goldberg's `}<a parentName="p" {...{
            "href": "https://arizona-primo.hosted.exlibrisgroup.com/permalink/f/6ljalh/01UA_ALMA51534702160003843",
            "target": "_self",
            "rel": "nofollow"
          }}><em parentName="a">{`Neural Network Methods for NLP`}</em></a><undefined parentName="p">{` `}<span {...{
              "id": "citation-0",
              "data-hover": ""
            }}><span parentName="span" {...{
                "className": "citation-number"
              }}>{`[?]`}</span></span>{` is full of practical tricks for training various types of neural networks. `}</undefined></p>
      </li>
      <li parentName="ul">
        <p parentName="li">{`For a deeper dive into task-specific tricks, see Sebastian Ruder's more extensive  `}<a parentName="p" {...{
            "href": "https://www.ruder.io/deep-learning-nlp-best-practices/",
            "target": "_self",
            "rel": "nofollow"
          }}><em parentName="a">{`Deep Learning for NLP Best Practices`}</em></a><undefined parentName="p">{` `}<span {...{
              "id": "citation-0",
              "data-hover": ""
            }}><span parentName="span" {...{
                "className": "citation-number"
              }}>{`[?]`}</span></span>{`. This tutorial omits discussion of RNNs and transformers (topics we will cover elsewhere).`}</undefined></p>
      </li>
    </ul>
    <h1 {...{
      "id": "outcomes",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h1" {...{
        "href": "#outcomes",
        "aria-label": "outcomes permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Outcomes`}</h1>
    <p>{`After completing this lesson, you'll be able to ...`}</p>
    <ul>
      <li parentName="ul">{`detect vanishing gradients`}</li>
      <li parentName="ul">{`identify potential solutions for addressing vanishing gradients`}</li>
      <li parentName="ul">{`detect exploding gradients`}</li>
      <li parentName="ul">{`identify potential solutions for addressing exploding gradients`}</li>
      <li parentName="ul">{`implement regularization strategies in PyTorch`}</li>
    </ul>
    {
      /* # Reproducibility
      >Completely reproducible results are not guaranteed across PyTorch releases, individual commits, or different platforms. Furthermore, results may not be reproducible between CPU and GPU executions, even when using identical seeds.
      https://pytorch.org/docs/stable/notes/randomness.html
      */
    }
    <h1 {...{
      "id": "problems",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h1" {...{
        "href": "#problems",
        "aria-label": "problems permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Problems`}</h1>
    <p>{`Let's look at some commonly encountered problems in deep learning and techniques that can help address these problems.`}</p>
    <div {...{
      "className": "admonition admonition-note alert alert--secondary"
    }}><div parentName="div" {...{
        "className": "admonition-heading"
      }}><h5 parentName="div"><span parentName="h5" {...{
            "className": "admonition-icon"
          }}><svg parentName="span" {...{
              "xmlns": "http://www.w3.org/2000/svg",
              "width": "14",
              "height": "16",
              "viewBox": "0 0 14 16"
            }}><path parentName="svg" {...{
                "fillRule": "evenodd",
                "d": "M6.3 5.69a.942.942 0 0 1-.28-.7c0-.28.09-.52.28-.7.19-.18.42-.28.7-.28.28 0 .52.09.7.28.18.19.28.42.28.7 0 .28-.09.52-.28.7a1 1 0 0 1-.7.3c-.28 0-.52-.11-.7-.3zM8 7.99c-.02-.25-.11-.48-.31-.69-.2-.19-.42-.3-.69-.31H6c-.27.02-.48.13-.69.31-.2.2-.3.44-.31.69h1v3c.02.27.11.5.31.69.2.2.42.31.69.31h1c.27 0 .48-.11.69-.31.2-.19.3-.42.31-.69H8V7.98v.01zM7 2.3c-3.14 0-5.7 2.54-5.7 5.68 0 3.14 2.56 5.7 5.7 5.7s5.7-2.55 5.7-5.7c0-3.15-2.56-5.69-5.7-5.69v.01zM7 .98c3.86 0 7 3.14 7 7s-3.14 7-7 7-7-3.12-7-7 3.14-7 7-7z"
              }}></path></svg></span>{`Not unique to DL`}</h5></div><div parentName="div" {...{
        "className": "admonition-content"
      }}><p parentName="div">{`Some of the problems discussed here are not at all unique to deep learning. `}</p></div></div>
    <h2 {...{
      "id": "imbalanced-data",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h2" {...{
        "href": "#imbalanced-data",
        "aria-label": "imbalanced data permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Imbalanced data`}</h2>
    <p>{`Class imbalance (an unequal proportion of each class in training data) is common in datasets.  Due to annotation costs or rarity, you may have few examples of a particular class.  Training on the data as-is will introduce a label bias (the model will learn to expect one class to be more common than another).  To avoid such a bias, you need some way of rebalancing the data or weighting your loss.`}</p>
    <h3 {...{
      "id": "addressing-imbalanced-data",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h3" {...{
        "href": "#addressing-imbalanced-data",
        "aria-label": "addressing imbalanced data permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Addressing imbalanced data`}</h3>
    <h4 {...{
      "id": "undersampling",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h4" {...{
        "href": "#undersampling",
        "aria-label": "undersampling permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Undersampling`}</h4>
    <p>{`One option for addressing class imbalance is to find the size of the smallest class and discard items from the other classes until all classes are equally represented (i.e., all have the same number of datapoints as the minority class); however, this may toss out useful examples that differ from the included datapoints in important ways that might limit the model's ability to generalize.`}</p>
    <p><figure parentName="p" {...{
        "className": "gatsby-resp-image-figure",
        "style": {}
      }}>{`
    `}<span parentName="figure" {...{
          "className": "gatsby-resp-image-wrapper",
          "style": {
            "position": "relative",
            "display": "block",
            "marginLeft": "auto",
            "marginRight": "auto",
            "maxWidth": "590px"
          }
        }}>{`
      `}<a parentName="span" {...{
            "className": "gatsby-resp-image-link",
            "href": "/static/d54bca51a60d08f55c826305911a5c35/a5d4d/got-a-stew-going.webp",
            "style": {
              "display": "block"
            },
            "target": "_blank",
            "rel": "noopener"
          }}>{`
    `}<span parentName="a" {...{
              "className": "gatsby-resp-image-background-image",
              "style": {
                "paddingBottom": "56.08108108108109%",
                "position": "relative",
                "bottom": "0",
                "left": "0",
                "backgroundImage": "url('data:image/webp;base64,UklGRnoAAABXRUJQVlA4IG4AAACwAwCdASoUAAsAPtFUo0uoJKMhsAgBABoJYwC7ABRe2So8ah58kAD+7ANM0C5Ivtj+E+ok2Xb+5fu/KSYxhtikuk5NjYXV1kT0xNbiPNLAu7+bbCUqmivhkP4+U0XlETt8N6NoUmlpW0c3yIAAAA==')",
                "backgroundSize": "cover",
                "display": "block"
              }
            }}></span>{`
  `}<img parentName="a" {...{
              "className": "gatsby-resp-image-image",
              "alt": "got a stew going",
              "title": "Carl Weathers (in Arrested Development): Whoa, whoa, whoa. There's still plenty of meat on that bone. Now you take this home, throw it in a pot, add some broth, a potato. Baby, you've got a stew going.",
              "src": "/static/d54bca51a60d08f55c826305911a5c35/5ca24/got-a-stew-going.webp",
              "srcSet": ["/static/d54bca51a60d08f55c826305911a5c35/cbe2e/got-a-stew-going.webp 148w", "/static/d54bca51a60d08f55c826305911a5c35/3084c/got-a-stew-going.webp 295w", "/static/d54bca51a60d08f55c826305911a5c35/5ca24/got-a-stew-going.webp 590w", "/static/d54bca51a60d08f55c826305911a5c35/dad35/got-a-stew-going.webp 885w", "/static/d54bca51a60d08f55c826305911a5c35/a5d4d/got-a-stew-going.webp 1000w"],
              "sizes": "(max-width: 590px) 100vw, 590px",
              "style": {
                "width": "100%",
                "height": "100%",
                "margin": "0",
                "verticalAlign": "middle",
                "position": "absolute",
                "top": "0",
                "left": "0"
              },
              "loading": "lazy",
              "decoding": "async"
            }}></img>{`
  `}</a>{`
    `}</span>{`
    `}<figcaption parentName="figure" {...{
          "className": "gatsby-resp-image-figcaption"
        }}>{`Carl Weathers (in Arrested Development): Whoa, whoa, whoa. There's still plenty of meat on that bone. Now you take this home, throw it in a pot, add some broth, a potato. Baby, you've got a stew going.`}</figcaption>{`
  `}</figure></p>
    <p>{`Another option is to randomly undersample each class in each epoch to allow datapoints that were set aside in an earlier epoch another opportunity to appear.  In PyTorch, this can be achieved using the `}<a parentName="p" {...{
        "href": "https://pytorch.org/docs/stable/data.html?highlight=weightedrandomsampler#torch.utils.data.WeightedRandomSampler",
        "target": "_self",
        "rel": "nofollow"
      }}><code parentName="a" {...{
          "className": "language-text"
        }}>{`torch.utils.data.sampler.WeightedRandomSampler`}</code></a>{`.  This sampling can be performed with or without replacement. First, we need to assign a weight to each class.  One option to discount the majority class is to weight each class by its inverse frequency:`}</p>
    <p><span parentName="p" {...{
        "className": "math math-inline"
      }}><span parentName="span" {...{
          "className": "katex"
        }}><span parentName="span" {...{
            "className": "katex-mathml"
          }}><math parentName="span" {...{
              "xmlns": "http://www.w3.org/1998/Math/MathML"
            }}><semantics parentName="math"><mrow parentName="semantics"><msub parentName="mrow"><mi parentName="msub">{`w`}</mi><msub parentName="msub"><mi parentName="msub">{`c`}</mi><mi parentName="msub">{`n`}</mi></msub></msub><mo parentName="mrow">{`=`}</mo><mfrac parentName="mrow"><mn parentName="mfrac">{`1`}</mn><mrow parentName="mfrac"><mtext parentName="mrow">{`count`}</mtext><mo parentName="mrow" {...{
                        "stretchy": "false"
                      }}>{`(`}</mo><msub parentName="mrow"><mi parentName="msub">{`c`}</mi><mi parentName="msub">{`n`}</mi></msub><mo parentName="mrow" {...{
                        "stretchy": "false"
                      }}>{`)`}</mo></mrow></mfrac></mrow><annotation parentName="semantics" {...{
                  "encoding": "application/x-tex"
                }}>{`w_{c_{n}} = \\frac{1}{\\text{count}(c_{n})}`}</annotation></semantics></math></span><span parentName="span" {...{
            "className": "katex-html",
            "aria-hidden": "true"
          }}><span parentName="span" {...{
              "className": "base"
            }}><span parentName="span" {...{
                "className": "strut",
                "style": {
                  "height": "0.6807em",
                  "verticalAlign": "-0.2501em"
                }
              }}></span><span parentName="span" {...{
                "className": "mord"
              }}><span parentName="span" {...{
                  "className": "mord mathnormal",
                  "style": {
                    "marginRight": "0.02691em"
                  }
                }}>{`w`}</span><span parentName="span" {...{
                  "className": "msupsub"
                }}><span parentName="span" {...{
                    "className": "vlist-t vlist-t2"
                  }}><span parentName="span" {...{
                      "className": "vlist-r"
                    }}><span parentName="span" {...{
                        "className": "vlist",
                        "style": {
                          "height": "0.1514em"
                        }
                      }}><span parentName="span" {...{
                          "style": {
                            "top": "-2.55em",
                            "marginLeft": "-0.0269em",
                            "marginRight": "0.05em"
                          }
                        }}><span parentName="span" {...{
                            "className": "pstrut",
                            "style": {
                              "height": "2.7em"
                            }
                          }}></span><span parentName="span" {...{
                            "className": "sizing reset-size6 size3 mtight"
                          }}><span parentName="span" {...{
                              "className": "mord mtight"
                            }}><span parentName="span" {...{
                                "className": "mord mtight"
                              }}><span parentName="span" {...{
                                  "className": "mord mathnormal mtight"
                                }}>{`c`}</span><span parentName="span" {...{
                                  "className": "msupsub"
                                }}><span parentName="span" {...{
                                    "className": "vlist-t vlist-t2"
                                  }}><span parentName="span" {...{
                                      "className": "vlist-r"
                                    }}><span parentName="span" {...{
                                        "className": "vlist",
                                        "style": {
                                          "height": "0.1645em"
                                        }
                                      }}><span parentName="span" {...{
                                          "style": {
                                            "top": "-2.357em",
                                            "marginLeft": "0em",
                                            "marginRight": "0.0714em"
                                          }
                                        }}><span parentName="span" {...{
                                            "className": "pstrut",
                                            "style": {
                                              "height": "2.5em"
                                            }
                                          }}></span><span parentName="span" {...{
                                            "className": "sizing reset-size3 size1 mtight"
                                          }}><span parentName="span" {...{
                                              "className": "mord mtight"
                                            }}><span parentName="span" {...{
                                                "className": "mord mathnormal mtight"
                                              }}>{`n`}</span></span></span></span></span><span parentName="span" {...{
                                        "className": "vlist-s"
                                      }}>{`​`}</span></span><span parentName="span" {...{
                                      "className": "vlist-r"
                                    }}><span parentName="span" {...{
                                        "className": "vlist",
                                        "style": {
                                          "height": "0.143em"
                                        }
                                      }}><span parentName="span"></span></span></span></span></span></span></span></span></span></span><span parentName="span" {...{
                        "className": "vlist-s"
                      }}>{`​`}</span></span><span parentName="span" {...{
                      "className": "vlist-r"
                    }}><span parentName="span" {...{
                        "className": "vlist",
                        "style": {
                          "height": "0.2501em"
                        }
                      }}><span parentName="span"></span></span></span></span></span></span><span parentName="span" {...{
                "className": "mspace",
                "style": {
                  "marginRight": "0.2778em"
                }
              }}></span><span parentName="span" {...{
                "className": "mrel"
              }}>{`=`}</span><span parentName="span" {...{
                "className": "mspace",
                "style": {
                  "marginRight": "0.2778em"
                }
              }}></span></span><span parentName="span" {...{
              "className": "base"
            }}><span parentName="span" {...{
                "className": "strut",
                "style": {
                  "height": "1.3651em",
                  "verticalAlign": "-0.52em"
                }
              }}></span><span parentName="span" {...{
                "className": "mord"
              }}><span parentName="span" {...{
                  "className": "mopen nulldelimiter"
                }}></span><span parentName="span" {...{
                  "className": "mfrac"
                }}><span parentName="span" {...{
                    "className": "vlist-t vlist-t2"
                  }}><span parentName="span" {...{
                      "className": "vlist-r"
                    }}><span parentName="span" {...{
                        "className": "vlist",
                        "style": {
                          "height": "0.8451em"
                        }
                      }}><span parentName="span" {...{
                          "style": {
                            "top": "-2.655em"
                          }
                        }}><span parentName="span" {...{
                            "className": "pstrut",
                            "style": {
                              "height": "3em"
                            }
                          }}></span><span parentName="span" {...{
                            "className": "sizing reset-size6 size3 mtight"
                          }}><span parentName="span" {...{
                              "className": "mord mtight"
                            }}><span parentName="span" {...{
                                "className": "mord text mtight"
                              }}><span parentName="span" {...{
                                  "className": "mord mtight"
                                }}>{`count`}</span></span><span parentName="span" {...{
                                "className": "mopen mtight"
                              }}>{`(`}</span><span parentName="span" {...{
                                "className": "mord mtight"
                              }}><span parentName="span" {...{
                                  "className": "mord mathnormal mtight"
                                }}>{`c`}</span><span parentName="span" {...{
                                  "className": "msupsub"
                                }}><span parentName="span" {...{
                                    "className": "vlist-t vlist-t2"
                                  }}><span parentName="span" {...{
                                      "className": "vlist-r"
                                    }}><span parentName="span" {...{
                                        "className": "vlist",
                                        "style": {
                                          "height": "0.1645em"
                                        }
                                      }}><span parentName="span" {...{
                                          "style": {
                                            "top": "-2.357em",
                                            "marginLeft": "0em",
                                            "marginRight": "0.0714em"
                                          }
                                        }}><span parentName="span" {...{
                                            "className": "pstrut",
                                            "style": {
                                              "height": "2.5em"
                                            }
                                          }}></span><span parentName="span" {...{
                                            "className": "sizing reset-size3 size1 mtight"
                                          }}><span parentName="span" {...{
                                              "className": "mord mtight"
                                            }}><span parentName="span" {...{
                                                "className": "mord mathnormal mtight"
                                              }}>{`n`}</span></span></span></span></span><span parentName="span" {...{
                                        "className": "vlist-s"
                                      }}>{`​`}</span></span><span parentName="span" {...{
                                      "className": "vlist-r"
                                    }}><span parentName="span" {...{
                                        "className": "vlist",
                                        "style": {
                                          "height": "0.143em"
                                        }
                                      }}><span parentName="span"></span></span></span></span></span></span><span parentName="span" {...{
                                "className": "mclose mtight"
                              }}>{`)`}</span></span></span></span><span parentName="span" {...{
                          "style": {
                            "top": "-3.23em"
                          }
                        }}><span parentName="span" {...{
                            "className": "pstrut",
                            "style": {
                              "height": "3em"
                            }
                          }}></span><span parentName="span" {...{
                            "className": "frac-line",
                            "style": {
                              "borderBottomWidth": "0.04em"
                            }
                          }}></span></span><span parentName="span" {...{
                          "style": {
                            "top": "-3.394em"
                          }
                        }}><span parentName="span" {...{
                            "className": "pstrut",
                            "style": {
                              "height": "3em"
                            }
                          }}></span><span parentName="span" {...{
                            "className": "sizing reset-size6 size3 mtight"
                          }}><span parentName="span" {...{
                              "className": "mord mtight"
                            }}><span parentName="span" {...{
                                "className": "mord mtight"
                              }}>{`1`}</span></span></span></span></span><span parentName="span" {...{
                        "className": "vlist-s"
                      }}>{`​`}</span></span><span parentName="span" {...{
                      "className": "vlist-r"
                    }}><span parentName="span" {...{
                        "className": "vlist",
                        "style": {
                          "height": "0.52em"
                        }
                      }}><span parentName="span"></span></span></span></span></span><span parentName="span" {...{
                  "className": "mclose nulldelimiter"
                }}></span></span></span></span></span></span></p>
    <div {...{
      "className": "gatsby-highlight",
      "data-language": "python"
    }}><pre parentName="div" {...{
        "className": "language-python"
      }}><code parentName="pre" {...{
          "className": "language-python"
        }}><span parentName="code" {...{
            "className": "token keyword"
          }}>{`from`}</span>{` torch`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`utils`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`data `}<span parentName="code" {...{
            "className": "token keyword"
          }}>{`import`}</span>{` WeightedRandomSampler`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{` Subset

`}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# define dataset, model, etc. ...`}</span>{`
`}<span parentName="code" {...{
            "className": "token keyword"
          }}>{`for`}</span>{` epoch `}<span parentName="code" {...{
            "className": "token keyword"
          }}>{`in`}</span>{` `}<span parentName="code" {...{
            "className": "token builtin"
          }}>{`range`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span><span parentName="code" {...{
            "className": "token number"
          }}>{`1`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{` max_epochs`}<span parentName="code" {...{
            "className": "token operator"
          }}>{`+`}</span><span parentName="code" {...{
            "className": "token number"
          }}>{`1`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`:`}</span>{`
  `}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# indices of our weighted sample`}</span>{`
  indices `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{` WeightedRandomSampler`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`weights`}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{`class_weights`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{` num_samples`}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span><span parentName="code" {...{
            "className": "token builtin"
          }}>{`len`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`dataset`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{` replacement`}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span><span parentName="code" {...{
            "className": "token boolean"
          }}>{`True`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span>{`
  `}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# select items from dataset by their indices`}</span>{`
  subset `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{` Subset`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`dataset`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{` indices`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span>{`
  myloader `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{` DataLoader`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`subset`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span>{`
  `}<span parentName="code" {...{
            "className": "token keyword"
          }}>{`for`}</span>{` i`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{` batch `}<span parentName="code" {...{
            "className": "token keyword"
          }}>{`in`}</span>{` `}<span parentName="code" {...{
            "className": "token builtin"
          }}>{`enumerate`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`myloader`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`:`}</span>{`
    `}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# optionally track current iteration`}</span>{`
    iteration `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{` epoch `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`*`}</span>{` i
    `}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# train as usual`}</span></code></pre></div>
    <h1 {...{
      "id": "overfitting",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h1" {...{
        "href": "#overfitting",
        "aria-label": "overfitting permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Overfitting`}</h1>
    {
      /* ## Causes of overfitting */
    }
    <p>{`Because DNNs have such a large number of parameters, it is very easy to inadvertently overfit when training (memorizing the training data without generalizing to unseen data).`}</p>
    <h2 {...{
      "id": "avoiding-overfitting",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h2" {...{
        "href": "#avoiding-overfitting",
        "aria-label": "avoiding overfitting permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Avoiding overfitting`}</h2>
    <p>{`There are several strategies we can adopt to reduce the risk of overfitting.`}</p>
    <h3 {...{
      "id": "fewer-parameters",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h3" {...{
        "href": "#fewer-parameters",
        "aria-label": "fewer parameters permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Fewer parameters`}</h3>
    <p>{`Reducing the overall number of parameters will help to address overfitting. Consider reducing the size of each layer (fewer hidden units) and the number of hidden layers (try a shallower network).`}</p>
    <h3 {...{
      "id": "weight-decay",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h3" {...{
        "href": "#weight-decay",
        "aria-label": "weight decay permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Weight decay`}</h3>
    <p>{`Functionally equivalent to L2 regularization, weight decay is a way of controlling the magnitude of the weights by downscaling according to some hyperparameter `}<span parentName="p" {...{
        "className": "math math-inline"
      }}><span parentName="span" {...{
          "className": "katex"
        }}><span parentName="span" {...{
            "className": "katex-mathml"
          }}><math parentName="span" {...{
              "xmlns": "http://www.w3.org/1998/Math/MathML"
            }}><semantics parentName="math"><mrow parentName="semantics"><mi parentName="mrow">{`λ`}</mi></mrow><annotation parentName="semantics" {...{
                  "encoding": "application/x-tex"
                }}>{`\\lambda`}</annotation></semantics></math></span><span parentName="span" {...{
            "className": "katex-html",
            "aria-hidden": "true"
          }}><span parentName="span" {...{
              "className": "base"
            }}><span parentName="span" {...{
                "className": "strut",
                "style": {
                  "height": "0.6944em"
                }
              }}></span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`λ`}</span></span></span></span></span>{`.  PyTorch's implementation of optimizers like `}<a parentName="p" {...{
        "href": "https://pytorch.org/docs/stable/generated/torch.optim.Adam.html#torch.optim.SGD",
        "target": "_self",
        "rel": "nofollow"
      }}>{`SGD`}</a>{` and `}<a parentName="p" {...{
        "href": "https://pytorch.org/docs/stable/generated/torch.optim.Adam.html#torch.optim.Adam",
        "target": "_self",
        "rel": "nofollow"
      }}>{`Adam`}</a>{` support weight decay as an optional hyperparameter. Increasing the value will increase the penalty on large weights:`}</p>
    <p><span parentName="p" {...{
        "className": "math math-inline"
      }}><span parentName="span" {...{
          "className": "katex"
        }}><span parentName="span" {...{
            "className": "katex-mathml"
          }}><math parentName="span" {...{
              "xmlns": "http://www.w3.org/1998/Math/MathML"
            }}><semantics parentName="math"><mrow parentName="semantics"><mfrac parentName="mrow"><mrow parentName="mfrac"><mi parentName="mrow">{`δ`}</mi><mi parentName="mrow">{`L`}</mi></mrow><mrow parentName="mfrac"><mi parentName="mrow">{`δ`}</mi><msub parentName="mrow"><mi parentName="msub">{`w`}</mi><mi parentName="msub">{`i`}</mi></msub></mrow></mfrac><mo parentName="mrow">{`+`}</mo><mi parentName="mrow">{`λ`}</mi><msub parentName="mrow"><mi parentName="msub">{`w`}</mi><mi parentName="msub">{`i`}</mi></msub></mrow><annotation parentName="semantics" {...{
                  "encoding": "application/x-tex"
                }}>{`\\frac{\\delta{L}}{\\delta w_{i}} + \\lambda w_{i}`}</annotation></semantics></math></span><span parentName="span" {...{
            "className": "katex-html",
            "aria-hidden": "true"
          }}><span parentName="span" {...{
              "className": "base"
            }}><span parentName="span" {...{
                "className": "strut",
                "style": {
                  "height": "1.3252em",
                  "verticalAlign": "-0.4451em"
                }
              }}></span><span parentName="span" {...{
                "className": "mord"
              }}><span parentName="span" {...{
                  "className": "mopen nulldelimiter"
                }}></span><span parentName="span" {...{
                  "className": "mfrac"
                }}><span parentName="span" {...{
                    "className": "vlist-t vlist-t2"
                  }}><span parentName="span" {...{
                      "className": "vlist-r"
                    }}><span parentName="span" {...{
                        "className": "vlist",
                        "style": {
                          "height": "0.8801em"
                        }
                      }}><span parentName="span" {...{
                          "style": {
                            "top": "-2.655em"
                          }
                        }}><span parentName="span" {...{
                            "className": "pstrut",
                            "style": {
                              "height": "3em"
                            }
                          }}></span><span parentName="span" {...{
                            "className": "sizing reset-size6 size3 mtight"
                          }}><span parentName="span" {...{
                              "className": "mord mtight"
                            }}><span parentName="span" {...{
                                "className": "mord mathnormal mtight",
                                "style": {
                                  "marginRight": "0.03785em"
                                }
                              }}>{`δ`}</span><span parentName="span" {...{
                                "className": "mord mtight"
                              }}><span parentName="span" {...{
                                  "className": "mord mathnormal mtight",
                                  "style": {
                                    "marginRight": "0.02691em"
                                  }
                                }}>{`w`}</span><span parentName="span" {...{
                                  "className": "msupsub"
                                }}><span parentName="span" {...{
                                    "className": "vlist-t vlist-t2"
                                  }}><span parentName="span" {...{
                                      "className": "vlist-r"
                                    }}><span parentName="span" {...{
                                        "className": "vlist",
                                        "style": {
                                          "height": "0.3281em"
                                        }
                                      }}><span parentName="span" {...{
                                          "style": {
                                            "top": "-2.357em",
                                            "marginLeft": "-0.0269em",
                                            "marginRight": "0.0714em"
                                          }
                                        }}><span parentName="span" {...{
                                            "className": "pstrut",
                                            "style": {
                                              "height": "2.5em"
                                            }
                                          }}></span><span parentName="span" {...{
                                            "className": "sizing reset-size3 size1 mtight"
                                          }}><span parentName="span" {...{
                                              "className": "mord mtight"
                                            }}><span parentName="span" {...{
                                                "className": "mord mathnormal mtight"
                                              }}>{`i`}</span></span></span></span></span><span parentName="span" {...{
                                        "className": "vlist-s"
                                      }}>{`​`}</span></span><span parentName="span" {...{
                                      "className": "vlist-r"
                                    }}><span parentName="span" {...{
                                        "className": "vlist",
                                        "style": {
                                          "height": "0.143em"
                                        }
                                      }}><span parentName="span"></span></span></span></span></span></span></span></span></span><span parentName="span" {...{
                          "style": {
                            "top": "-3.23em"
                          }
                        }}><span parentName="span" {...{
                            "className": "pstrut",
                            "style": {
                              "height": "3em"
                            }
                          }}></span><span parentName="span" {...{
                            "className": "frac-line",
                            "style": {
                              "borderBottomWidth": "0.04em"
                            }
                          }}></span></span><span parentName="span" {...{
                          "style": {
                            "top": "-3.394em"
                          }
                        }}><span parentName="span" {...{
                            "className": "pstrut",
                            "style": {
                              "height": "3em"
                            }
                          }}></span><span parentName="span" {...{
                            "className": "sizing reset-size6 size3 mtight"
                          }}><span parentName="span" {...{
                              "className": "mord mtight"
                            }}><span parentName="span" {...{
                                "className": "mord mathnormal mtight",
                                "style": {
                                  "marginRight": "0.03785em"
                                }
                              }}>{`δ`}</span><span parentName="span" {...{
                                "className": "mord mtight"
                              }}><span parentName="span" {...{
                                  "className": "mord mathnormal mtight"
                                }}>{`L`}</span></span></span></span></span></span><span parentName="span" {...{
                        "className": "vlist-s"
                      }}>{`​`}</span></span><span parentName="span" {...{
                      "className": "vlist-r"
                    }}><span parentName="span" {...{
                        "className": "vlist",
                        "style": {
                          "height": "0.4451em"
                        }
                      }}><span parentName="span"></span></span></span></span></span><span parentName="span" {...{
                  "className": "mclose nulldelimiter"
                }}></span></span><span parentName="span" {...{
                "className": "mspace",
                "style": {
                  "marginRight": "0.2222em"
                }
              }}></span><span parentName="span" {...{
                "className": "mbin"
              }}>{`+`}</span><span parentName="span" {...{
                "className": "mspace",
                "style": {
                  "marginRight": "0.2222em"
                }
              }}></span></span><span parentName="span" {...{
              "className": "base"
            }}><span parentName="span" {...{
                "className": "strut",
                "style": {
                  "height": "0.8444em",
                  "verticalAlign": "-0.15em"
                }
              }}></span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`λ`}</span><span parentName="span" {...{
                "className": "mord"
              }}><span parentName="span" {...{
                  "className": "mord mathnormal",
                  "style": {
                    "marginRight": "0.02691em"
                  }
                }}>{`w`}</span><span parentName="span" {...{
                  "className": "msupsub"
                }}><span parentName="span" {...{
                    "className": "vlist-t vlist-t2"
                  }}><span parentName="span" {...{
                      "className": "vlist-r"
                    }}><span parentName="span" {...{
                        "className": "vlist",
                        "style": {
                          "height": "0.3117em"
                        }
                      }}><span parentName="span" {...{
                          "style": {
                            "top": "-2.55em",
                            "marginLeft": "-0.0269em",
                            "marginRight": "0.05em"
                          }
                        }}><span parentName="span" {...{
                            "className": "pstrut",
                            "style": {
                              "height": "2.7em"
                            }
                          }}></span><span parentName="span" {...{
                            "className": "sizing reset-size6 size3 mtight"
                          }}><span parentName="span" {...{
                              "className": "mord mtight"
                            }}><span parentName="span" {...{
                                "className": "mord mathnormal mtight"
                              }}>{`i`}</span></span></span></span></span><span parentName="span" {...{
                        "className": "vlist-s"
                      }}>{`​`}</span></span><span parentName="span" {...{
                      "className": "vlist-r"
                    }}><span parentName="span" {...{
                        "className": "vlist",
                        "style": {
                          "height": "0.15em"
                        }
                      }}><span parentName="span"></span></span></span></span></span></span></span></span></span></span></p>
    <div {...{
      "className": "gatsby-highlight",
      "data-language": "python"
    }}><pre parentName="div" {...{
        "className": "language-python"
      }}><code parentName="pre" {...{
          "className": "language-python"
        }}><span parentName="code" {...{
            "className": "token keyword"
          }}>{`from`}</span>{` torch `}<span parentName="code" {...{
            "className": "token keyword"
          }}>{`import`}</span>{` optim

`}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# model here`}</span>{`
`}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# ...`}</span>{`

optimizer `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{` optim`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`SGD`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`model`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`parameters`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{` lr`}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span><span parentName="code" {...{
            "className": "token number"
          }}>{`0.01`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{` weight_decay`}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span><span parentName="code" {...{
            "className": "token number"
          }}>{`0.5`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span></code></pre></div>
    <h3 {...{
      "id": "dropout",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h3" {...{
        "href": "#dropout",
        "aria-label": "dropout permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Dropout`}</h3>
    <p>{`At each forward pass `}<strong parentName="p">{`during training`}</strong>{` with some probability `}<span parentName="p" {...{
        "className": "math math-inline"
      }}><span parentName="span" {...{
          "className": "katex"
        }}><span parentName="span" {...{
            "className": "katex-mathml"
          }}><math parentName="span" {...{
              "xmlns": "http://www.w3.org/1998/Math/MathML"
            }}><semantics parentName="math"><mrow parentName="semantics"><mi parentName="mrow">{`p`}</mi></mrow><annotation parentName="semantics" {...{
                  "encoding": "application/x-tex"
                }}>{`p`}</annotation></semantics></math></span><span parentName="span" {...{
            "className": "katex-html",
            "aria-hidden": "true"
          }}><span parentName="span" {...{
              "className": "base"
            }}><span parentName="span" {...{
                "className": "strut",
                "style": {
                  "height": "0.625em",
                  "verticalAlign": "-0.1944em"
                }
              }}></span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`p`}</span></span></span></span></span>{`, each neuron has a chance to have its activation set to 0.  A good default value for `}<span parentName="p" {...{
        "className": "math math-inline"
      }}><span parentName="span" {...{
          "className": "katex"
        }}><span parentName="span" {...{
            "className": "katex-mathml"
          }}><math parentName="span" {...{
              "xmlns": "http://www.w3.org/1998/Math/MathML"
            }}><semantics parentName="math"><mrow parentName="semantics"><mi parentName="mrow">{`p`}</mi></mrow><annotation parentName="semantics" {...{
                  "encoding": "application/x-tex"
                }}>{`p`}</annotation></semantics></math></span><span parentName="span" {...{
            "className": "katex-html",
            "aria-hidden": "true"
          }}><span parentName="span" {...{
              "className": "base"
            }}><span parentName="span" {...{
                "className": "strut",
                "style": {
                  "height": "0.625em",
                  "verticalAlign": "-0.1944em"
                }
              }}></span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`p`}</span></span></span></span></span>{` commonly used in many NLP tasks is 0.5.`}</p>
    <h3 {...{
      "id": "early-stopping",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h3" {...{
        "href": "#early-stopping",
        "aria-label": "early stopping permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Early stopping`}</h3>
    <p>{`A lower loss on the training data doesn't always guarantee a more reliable model.  To determine this, it is important to monitor your model's performance on some held-out data to better gauge the model's ability to generalize to unseen data.  Often, this means setting aside a portion of your training data for development or validation (ex. a stratified selection of 10% of your training data).  You may not want to perform this validation check after each iteration, as it may be noisy (especially early in training) and time-consuming.  Instead, you may choose to do it every `}<span parentName="p" {...{
        "className": "math math-inline"
      }}><span parentName="span" {...{
          "className": "katex"
        }}><span parentName="span" {...{
            "className": "katex-mathml"
          }}><math parentName="span" {...{
              "xmlns": "http://www.w3.org/1998/Math/MathML"
            }}><semantics parentName="math"><mrow parentName="semantics"><mi parentName="mrow">{`k`}</mi></mrow><annotation parentName="semantics" {...{
                  "encoding": "application/x-tex"
                }}>{`k`}</annotation></semantics></math></span><span parentName="span" {...{
            "className": "katex-html",
            "aria-hidden": "true"
          }}><span parentName="span" {...{
              "className": "base"
            }}><span parentName="span" {...{
                "className": "strut",
                "style": {
                  "height": "0.6944em"
                }
              }}></span><span parentName="span" {...{
                "className": "mord mathnormal",
                "style": {
                  "marginRight": "0.03148em"
                }
              }}>{`k`}</span></span></span></span></span>{` iterations or at the end of each training `}<strong parentName="p">{`epoch`}</strong>{`.  `}</p>
    <p>{`Rather than training for a fixed number of epochs, it is better to specify a maximum number of epochs. By monitoring performance on training and validation partitions, you may find an earlier timestep to end training before your model begins to overfit or memorize the training data. This technique is known as `}<strong parentName="p">{`early stopping`}</strong>{`.  If performance on the validation partition begins to degrade while the loss on the training data continues to decrease, your model is likely `}<strong parentName="p">{`overfitting`}</strong>{`.`}</p>
    <p>{`If the degradation on validation is "small", though, you may want to wait to see if the degradation is a temporary fluctuation (ex. a local instability) or representative of a trend.  Once performance on validation has continued to degrade for some number of consecutive checks (a hyperparameter known as `}<strong parentName="p">{`patience`}</strong>{`), you will want to stop training and use the best snapshot of your model.  While PyTorch doesn't provide an out-of-the-box implementation for early stopping, many high-level wrappers like `}<a parentName="p" {...{
        "href": "https://www.pytorchlightning.ai/",
        "target": "_self",
        "rel": "nofollow"
      }}><code parentName="a" {...{
          "className": "language-text"
        }}>{`pytorch-lightning`}</code></a>{` and `}<a parentName="p" {...{
        "href": "https://pytorch-ignite.ai/",
        "target": "_self",
        "rel": "nofollow"
      }}><code parentName="a" {...{
          "className": "language-text"
        }}>{`pytorch-ignite`}</code></a>{` have implementations.  Here is an example of checkpointing and early stopping using `}<a parentName="p" {...{
        "href": "https://pytorch-ignite.ai/",
        "target": "_self",
        "rel": "nofollow"
      }}><code parentName="a" {...{
          "className": "language-text"
        }}>{`pytorch-ignite`}</code></a>{`: `}</p>
    <div {...{
      "className": "gatsby-highlight",
      "data-language": "python"
    }}><pre parentName="div" {...{
        "className": "language-python"
      }}><code parentName="pre" {...{
          "className": "language-python"
        }}><span parentName="code" {...{
            "className": "token keyword"
          }}>{`from`}</span>{` ignite`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`engine `}<span parentName="code" {...{
            "className": "token keyword"
          }}>{`import`}</span>{` Engine
`}<span parentName="code" {...{
            "className": "token keyword"
          }}>{`from`}</span>{` ignite`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`handlers `}<span parentName="code" {...{
            "className": "token keyword"
          }}>{`import`}</span>{` EarlyStopping`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{` ModelCheckpoint
`}<span parentName="code" {...{
            "className": "token keyword"
          }}>{`from`}</span>{` ignite`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`metrics `}<span parentName="code" {...{
            "className": "token keyword"
          }}>{`import`}</span>{` Accuracy`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{` Loss`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{` Precision`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{` Recall

`}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# define model, loss, optimizer, `}</span>{`
`}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# train_dataloader, validation_dataloader, etc.`}</span>{`
`}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# ...`}</span>{`

`}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# set device and configure model `}</span>{`
device `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{` `}<span parentName="code" {...{
            "className": "token string"
          }}>{`"cpu"`}</span>{`
`}<span parentName="code" {...{
            "className": "token keyword"
          }}>{`if`}</span>{` torch`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`cuda`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`is_available`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`:`}</span>{`
  device `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{` `}<span parentName="code" {...{
            "className": "token string"
          }}>{`"cuda"`}</span>{`
  model`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`cuda`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`device`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span>{`

`}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# see https://pytorch.org/ignite/metrics.html#metric-arithmetics`}</span>{`
precision `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{` Precision`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`average`}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span><span parentName="code" {...{
            "className": "token boolean"
          }}>{`False`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span>{`
recall `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{` Recall`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`average`}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span><span parentName="code" {...{
            "className": "token boolean"
          }}>{`False`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span>{`
`}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# we can compose metrics`}</span>{`
F1 `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{` `}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`precision `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`*`}</span>{` recall `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`*`}</span>{` `}<span parentName="code" {...{
            "className": "token number"
          }}>{`2`}</span>{` `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`/`}</span>{` `}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`precision `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`+`}</span>{` recall`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`mean`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span>{`

metrics `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{` `}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`{`}</span>{`
  `}<span parentName="code" {...{
            "className": "token string"
          }}>{`"accuracy"`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`:`}</span>{` Accuracy`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{`
  `}<span parentName="code" {...{
            "className": "token string"
          }}>{`"precision"`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`:`}</span>{` precision`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{`
  `}<span parentName="code" {...{
            "className": "token string"
          }}>{`"recall"`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`:`}</span>{` recall`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{`
  `}<span parentName="code" {...{
            "className": "token string"
          }}>{`"f1"`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`:`}</span>{` F1`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{`
  `}<span parentName="code" {...{
            "className": "token string"
          }}>{`"loss"`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`:`}</span>{` Loss`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`criterion`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span>{`
`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`}`}</span>{`

`}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# just a function that defines `}</span>{`
`}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# how we process a batch in training`}</span>{`
`}<span parentName="code" {...{
            "className": "token keyword"
          }}>{`def`}</span>{` `}<span parentName="code" {...{
            "className": "token function"
          }}>{`train_step`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`engine`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{` batch`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`:`}</span>{`
  model`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`train`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span>{`
  optimizer`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`zero_grad`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span>{`
  x`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{` y_true `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{` batch
  x `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{` x`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`to`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`device`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span>{`
  y_true `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{` y_true`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`to`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`device`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span>{`
  y_hat `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{` model`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`x`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span>{`
  loss `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{` criterion`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`y_hat`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{` y_true`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span><span parentName="code" {...{
            "className": "token builtin"
          }}>{`float`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span>{`
  loss`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`backward`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span>{`
  optimizer`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`step`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span>{`
  `}<span parentName="code" {...{
            "className": "token keyword"
          }}>{`return`}</span>{` loss`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`item`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span>{`

`}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# a function that defines `}</span>{`
`}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# how we process data for evaluation`}</span>{`
`}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# adjust as appropriate`}</span>{`
`}<span parentName="code" {...{
            "className": "token keyword"
          }}>{`def`}</span>{` `}<span parentName="code" {...{
            "className": "token function"
          }}>{`validation_step`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`engine`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{` batch`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`:`}</span>{`
  model`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span><span parentName="code" {...{
            "className": "token builtin"
          }}>{`eval`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span>{`
  batch `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{` `}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`{`}</span>{`k`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`:`}</span>{` v`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`to`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`device`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span>{` `}<span parentName="code" {...{
            "className": "token keyword"
          }}>{`for`}</span>{` k`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{` v `}<span parentName="code" {...{
            "className": "token keyword"
          }}>{`in`}</span>{` batch`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`items`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`}`}</span>{`
  x`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{` y `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{` batch
  `}<span parentName="code" {...{
            "className": "token keyword"
          }}>{`with`}</span>{` torch`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`no_grad`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`:`}</span>{`
    logits `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{` model`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`x`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span>{`
    predictions `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{` torch`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`argmax`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`logits`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{` dim`}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span><span parentName="code" {...{
            "className": "token operator"
          }}>{`-`}</span><span parentName="code" {...{
            "className": "token number"
          }}>{`1`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span>{`
    `}<span parentName="code" {...{
            "className": "token keyword"
          }}>{`return`}</span>{` `}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`{`}</span><span parentName="code" {...{
            "className": "token string"
          }}>{`'y_pred'`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`:`}</span>{` predictions`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{` `}<span parentName="code" {...{
            "className": "token string"
          }}>{`'y'`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`:`}</span>{` y`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`}`}</span>{`


trainer `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{` Engine`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`train_step`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span>{`

evaluator `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{` Engine`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`validation_step`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span>{`

`}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# attach metrics to our evaluator`}</span>{`
`}<span parentName="code" {...{
            "className": "token keyword"
          }}>{`for`}</span>{` name`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{` metric `}<span parentName="code" {...{
            "className": "token keyword"
          }}>{`in`}</span>{` metrics`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`items`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`:`}</span>{`
    metric`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`attach`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`evaluator`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{` name`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span>{`


n `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{` `}<span parentName="code" {...{
            "className": "token number"
          }}>{`10`}</span>{`
`}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# after every 10 iterations (batches), `}</span>{`
`}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# evaluate the model on both train and validation sets`}</span>{`
`}<span parentName="code" {...{
            "className": "token decorator annotation punctuation"
          }}>{`@trainer`}<span parentName="span" {...{
              "className": "token punctuation"
            }}>{`.`}</span>{`on`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`Events`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`ITERATION_COMPLETED`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`every`}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{`n`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span>{`
`}<span parentName="code" {...{
            "className": "token keyword"
          }}>{`def`}</span>{` `}<span parentName="code" {...{
            "className": "token function"
          }}>{`log_performance`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`trainer`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`:`}</span>{`
  `}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# run against training`}</span>{`
  evaluator`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`run`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`train_dataloader`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span>{`
  metrics `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{` evaluator`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`state`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`metrics
  `}<span parentName="code" {...{
            "className": "token keyword"
          }}>{`print`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span><span parentName="code" {...{
            "className": "token string-interpolation"
          }}><span parentName="span" {...{
              "className": "token string"
            }}>{`f"""
  Training (Epoch `}</span><span parentName="span" {...{
              "className": "token interpolation"
            }}><span parentName="span" {...{
                "className": "token punctuation"
              }}>{`{`}</span>{`trainer`}<span parentName="span" {...{
                "className": "token punctuation"
              }}>{`.`}</span>{`state`}<span parentName="span" {...{
                "className": "token punctuation"
              }}>{`.`}</span>{`epoch`}<span parentName="span" {...{
                "className": "token punctuation"
              }}>{`}`}</span></span><span parentName="span" {...{
              "className": "token string"
            }}>{`)
  ........................................ 
  Precision: `}</span><span parentName="span" {...{
              "className": "token interpolation"
            }}><span parentName="span" {...{
                "className": "token punctuation"
              }}>{`{`}</span>{`metrics`}<span parentName="span" {...{
                "className": "token punctuation"
              }}>{`[`}</span><span parentName="span" {...{
                "className": "token string"
              }}>{`'precision'`}</span><span parentName="span" {...{
                "className": "token punctuation"
              }}>{`]`}</span><span parentName="span" {...{
                "className": "token punctuation"
              }}>{`:`}</span><span parentName="span" {...{
                "className": "token format-spec"
              }}>{`.2f`}</span><span parentName="span" {...{
                "className": "token punctuation"
              }}>{`}`}</span></span><span parentName="span" {...{
              "className": "token string"
            }}>{` 
      Recall: `}</span><span parentName="span" {...{
              "className": "token interpolation"
            }}><span parentName="span" {...{
                "className": "token punctuation"
              }}>{`{`}</span>{`metrics`}<span parentName="span" {...{
                "className": "token punctuation"
              }}>{`[`}</span><span parentName="span" {...{
                "className": "token string"
              }}>{`'recall'`}</span><span parentName="span" {...{
                "className": "token punctuation"
              }}>{`]`}</span><span parentName="span" {...{
                "className": "token punctuation"
              }}>{`:`}</span><span parentName="span" {...{
                "className": "token format-spec"
              }}>{`.2f`}</span><span parentName="span" {...{
                "className": "token punctuation"
              }}>{`}`}</span></span><span parentName="span" {...{
              "className": "token string"
            }}>{` 
          F1: `}</span><span parentName="span" {...{
              "className": "token interpolation"
            }}><span parentName="span" {...{
                "className": "token punctuation"
              }}>{`{`}</span>{`metrics`}<span parentName="span" {...{
                "className": "token punctuation"
              }}>{`[`}</span><span parentName="span" {...{
                "className": "token string"
              }}>{`'f1'`}</span><span parentName="span" {...{
                "className": "token punctuation"
              }}>{`]`}</span><span parentName="span" {...{
                "className": "token punctuation"
              }}>{`:`}</span><span parentName="span" {...{
                "className": "token format-spec"
              }}>{`.2f`}</span><span parentName="span" {...{
                "className": "token punctuation"
              }}>{`}`}</span></span><span parentName="span" {...{
              "className": "token string"
            }}>{`
  ........................................
  """`}</span></span>{`
  `}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span>{`
  `}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# run against validation`}</span>{`
  evaluator`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`run`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`validation_dataloader`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span>{`
  metrics `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{` evaluator`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`state`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`metrics
  `}<span parentName="code" {...{
            "className": "token keyword"
          }}>{`print`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span><span parentName="code" {...{
            "className": "token string-interpolation"
          }}><span parentName="span" {...{
              "className": "token string"
            }}>{`f"""
  Validation (Epoch `}</span><span parentName="span" {...{
              "className": "token interpolation"
            }}><span parentName="span" {...{
                "className": "token punctuation"
              }}>{`{`}</span>{`trainer`}<span parentName="span" {...{
                "className": "token punctuation"
              }}>{`.`}</span>{`state`}<span parentName="span" {...{
                "className": "token punctuation"
              }}>{`.`}</span>{`epoch`}<span parentName="span" {...{
                "className": "token punctuation"
              }}>{`}`}</span></span><span parentName="span" {...{
              "className": "token string"
            }}>{`)
  ........................................ 
  Precision: `}</span><span parentName="span" {...{
              "className": "token interpolation"
            }}><span parentName="span" {...{
                "className": "token punctuation"
              }}>{`{`}</span>{`metrics`}<span parentName="span" {...{
                "className": "token punctuation"
              }}>{`[`}</span><span parentName="span" {...{
                "className": "token string"
              }}>{`'precision'`}</span><span parentName="span" {...{
                "className": "token punctuation"
              }}>{`]`}</span><span parentName="span" {...{
                "className": "token punctuation"
              }}>{`:`}</span><span parentName="span" {...{
                "className": "token format-spec"
              }}>{`.2f`}</span><span parentName="span" {...{
                "className": "token punctuation"
              }}>{`}`}</span></span><span parentName="span" {...{
              "className": "token string"
            }}>{` 
      Recall: `}</span><span parentName="span" {...{
              "className": "token interpolation"
            }}><span parentName="span" {...{
                "className": "token punctuation"
              }}>{`{`}</span>{`metrics`}<span parentName="span" {...{
                "className": "token punctuation"
              }}>{`[`}</span><span parentName="span" {...{
                "className": "token string"
              }}>{`'recall'`}</span><span parentName="span" {...{
                "className": "token punctuation"
              }}>{`]`}</span><span parentName="span" {...{
                "className": "token punctuation"
              }}>{`:`}</span><span parentName="span" {...{
                "className": "token format-spec"
              }}>{`.2f`}</span><span parentName="span" {...{
                "className": "token punctuation"
              }}>{`}`}</span></span><span parentName="span" {...{
              "className": "token string"
            }}>{` 
          F1: `}</span><span parentName="span" {...{
              "className": "token interpolation"
            }}><span parentName="span" {...{
                "className": "token punctuation"
              }}>{`{`}</span>{`metrics`}<span parentName="span" {...{
                "className": "token punctuation"
              }}>{`[`}</span><span parentName="span" {...{
                "className": "token string"
              }}>{`'f1'`}</span><span parentName="span" {...{
                "className": "token punctuation"
              }}>{`]`}</span><span parentName="span" {...{
                "className": "token punctuation"
              }}>{`:`}</span><span parentName="span" {...{
                "className": "token format-spec"
              }}>{`.2f`}</span><span parentName="span" {...{
                "className": "token punctuation"
              }}>{`}`}</span></span><span parentName="span" {...{
              "className": "token string"
            }}>{`
  ........................................
  """`}</span></span>{`
  `}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span>{`

checkpoint_handler `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{` ModelCheckpoint`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`dirname`}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span><span parentName="code" {...{
            "className": "token string"
          }}>{`"/data/experiment-x/models/snapshots"`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{` filename_prefix`}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span><span parentName="code" {...{
            "className": "token string"
          }}>{`'my-model'`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{` n_saved`}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span><span parentName="code" {...{
            "className": "token number"
          }}>{`2`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{` create_dir`}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span><span parentName="code" {...{
            "className": "token boolean"
          }}>{`True`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span>{`
`}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# as an alternative to decorators, `}</span>{`
`}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# we can also register event handlers using .add_event_handler`}</span>{`
trainer`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`add_event_handler`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`Events`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`ITERATION_COMPLETED`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`every`}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{`n`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{` checkpoint_handler`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{` `}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`{`}</span><span parentName="code" {...{
            "className": "token string"
          }}>{`"model"`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`:`}</span>{` model`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`}`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span>{`

`}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# in order to use early stopping,`}</span>{`
`}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# we need to periodically record performance`}</span>{`
`}<span parentName="code" {...{
            "className": "token keyword"
          }}>{`def`}</span>{` `}<span parentName="code" {...{
            "className": "token function"
          }}>{`evaluate`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`engine`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`:`}</span>{`
  evaluator`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`run`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`train_dataloader`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span>{`
  evaluator`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`run`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`validation_dataloader`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span>{`

trainer`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`add_event_handler`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`Events`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`EPOCH_STARTED`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{` evaluate`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span>{`

`}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# now we can set up early stopping.`}</span>{`
`}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# we could use any recorded metric we want.`}</span>{`
`}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# here we'll use f1.`}</span>{`
`}<span parentName="code" {...{
            "className": "token keyword"
          }}>{`def`}</span>{` `}<span parentName="code" {...{
            "className": "token function"
          }}>{`early_stopping_using`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`engine`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`:`}</span>{`
  `}<span parentName="code" {...{
            "className": "token keyword"
          }}>{`return`}</span>{` engine`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`state`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`metrics`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`[`}</span><span parentName="code" {...{
            "className": "token string"
          }}>{`'f1'`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`]`}</span>{`

es_handler `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{` EarlyStopping`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`
  patience`}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span><span parentName="code" {...{
            "className": "token number"
          }}>{`10`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{` 
  score_function`}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{`early_stopping_using`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{` 
  trainer`}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{`trainer
`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span>{`
evaluator`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`add_event_handler`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`Events`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`EPOCH_COMPLETED`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{` es_handler`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span>{`

`}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# train!`}</span>{`
trainer`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`run`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`dataloader`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{` max_epochs`}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span><span parentName="code" {...{
            "className": "token number"
          }}>{`50`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span></code></pre></div>
    <h2 {...{
      "id": "failing-to-converge",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h2" {...{
        "href": "#failing-to-converge",
        "aria-label": "failing to converge permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Failing to converge`}</h2>
    <p>{`If your loss decreases and then starts vacillating between increasing and decreasing, you may need to adjust your learning rate to continue learning.`}</p>
    <h3 {...{
      "id": "dynamic-learning-rates",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h3" {...{
        "href": "#dynamic-learning-rates",
        "aria-label": "dynamic learning rates permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Dynamic learning rates`}</h3>
    <p>{`When you first begin training, larger updates may safely be taken.  As learning progresses, smaller steps are often necessary to get a network to converge.  One way of achieving this is through `}<strong parentName="p">{`learning rate scheduling`}</strong>{`, where for each update the learning rate is calculated as the initial learning rate divided by the iteration (batch) number.`}</p>
    <p>{`In PyTorch, we can do something like the following:`}</p>
    <div {...{
      "className": "gatsby-highlight",
      "data-language": "python"
    }}><pre parentName="div" {...{
        "className": "language-python"
      }}><code parentName="pre" {...{
          "className": "language-python"
        }}><span parentName="code" {...{
            "className": "token keyword"
          }}>{`for`}</span>{` i`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{` data `}<span parentName="code" {...{
            "className": "token keyword"
          }}>{`in`}</span>{` `}<span parentName="code" {...{
            "className": "token builtin"
          }}>{`enumerate`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`training_loader`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`:`}</span>{`
  `}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# data and labels for batch`}</span>{`
  x`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{` y_true `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{` data
  `}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# zero gradients`}</span>{`
  optimizer`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`zero_grad`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span>{`
  `}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# our forward pass`}</span>{`
  y_hat `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{` model`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`x`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span>{`
  `}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# calculate our loss`}</span>{`
  loss `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{` criterion`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`y_hat`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{` y_true`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span>{`
  `}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# calculate our gradients`}</span>{`
  loss`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`backward`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span>{`
  optimizer`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`param_groups`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`[`}</span><span parentName="code" {...{
            "className": "token number"
          }}>{`0`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`]`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`[`}</span><span parentName="code" {...{
            "className": "token string"
          }}>{`"lr"`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`]`}</span>{` `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{` starting_lr `}<span parentName="code" {...{
            "className": "token operator"
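          }}>{`/`}</span>{` `}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`i `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`+`}</span>{` `}<span parentName="code" {...{
            "className": "token number"
          }}>{`1`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span>{`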
  optimizer`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`step`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span>{`  `}</code></pre></div>
    <p>{`The `}<a parentName="p" {...{
        "href": "https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate",
        "target": "_self",
        "rel": "nofollow"
      }}><code parentName="a" {...{
          "className": "language-text"
        }}>{`torch.optim.lr_scheduler`}</code>{` package`}</a>{` provides implementations of several strategies for adjusting the learning rate during training.`}</p>
    <h2 {...{
      "id": "unstable-learning",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h2" {...{
        "href": "#unstable-learning",
        "aria-label": "unstable learning permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Unstable learning`}</h2>
    <p>{`Is your loss going up and down?  This suggests there is some instability in your network or in how you've structured your training loop.`}</p>
    <h3 {...{
      "id": "addressing-unstable-learning",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h3" {...{
        "href": "#addressing-unstable-learning",
        "aria-label": "addressing unstable learning permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Addressing unstable learning`}</h3>
    <p>{`There are several changes that can be made to improve learning stability.`}</p>
    <h4 {...{
      "id": "increase-batch-size",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h4" {...{
        "href": "#increase-batch-size",
        "aria-label": "increase batch size permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Increase batch size`}</h4>
    <p>{`Larger batches may make for smoother learning/more consistent updates. Try doubling your batch size and training for a few epochs.  What do you observe?`}</p>
    <div {...{
      "className": "admonition admonition-note alert alert--secondary"
    }}><div parentName="div" {...{
        "className": "admonition-heading"
      }}><h5 parentName="div"><span parentName="h5" {...{
            "className": "admonition-icon"
          }}><svg parentName="span" {...{
              "xmlns": "http://www.w3.org/2000/svg",
              "width": "14",
              "height": "16",
              "viewBox": "0 0 14 16"
            }}><path parentName="svg" {...{
                "fillRule": "evenodd",
                "d": "M6.3 5.69a.942.942 0 0 1-.28-.7c0-.28.09-.52.28-.7.19-.18.42-.28.7-.28.28 0 .52.09.7.28.18.19.28.42.28.7 0 .28-.09.52-.28.7a1 1 0 0 1-.7.3c-.28 0-.52-.11-.7-.3zM8 7.99c-.02-.25-.11-.48-.31-.69-.2-.19-.42-.3-.69-.31H6c-.27.02-.48.13-.69.31-.2.2-.3.44-.31.69h1v3c.02.27.11.5.31.69.2.2.42.31.69.31h1c.27 0 .48-.11.69-.31.2-.19.3-.42.31-.69H8V7.98v.01zM7 2.3c-3.14 0-5.7 2.54-5.7 5.68 0 3.14 2.56 5.7 5.7 5.7s5.7-2.55 5.7-5.7c0-3.15-2.56-5.69-5.7-5.69v.01zM7 .98c3.86 0 7 3.14 7 7s-3.14 7-7 7-7-3.12-7-7 3.14-7 7-7z"
              }}></path></svg></span>{`increasing learning rate`}</h5></div><div parentName="div" {...{
        "className": "admonition-content"
      }}><p parentName="div">{`With larger batch sizes, it may be advantageous to take larger steps when updating parameters (i.e., increase your learning rate).`}</p></div></div>
    <h4 {...{
      "id": "gradient-accumulation",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h4" {...{
        "href": "#gradient-accumulation",
        "aria-label": "gradient accumulation permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Gradient accumulation`}</h4>
    <p>{`Memory limitations may make it infeasible to increase the batch size. One solution is to accumulate gradients over several batches, average them, and then make a single update:`}</p>
    <div {...{
      "className": "gatsby-highlight",
      "data-language": "python"
    }}><pre parentName="div" {...{
        "className": "language-python"
      }}><code parentName="pre" {...{
          "className": "language-python"
        }}><span parentName="code" {...{
            "className": "token comment"
          }}>{`# how many batches should we wait`}</span>{`
`}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# before updating our parameters?`}</span>{`
accumulate_for `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{` `}<span parentName="code" {...{
            "className": "token number"
          }}>{`4`}</span>{`
`}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# a tally of how many batches we've processed `}</span>{`
`}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# since we last updated our model's parameters`}</span>{`
batches_processed `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{` `}<span parentName="code" {...{
            "className": "token number"
          }}>{`0`}</span>{`
loss `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{` `}<span parentName="code" {...{
            "className": "token number"
          }}>{`0`}</span>{`
`}<span parentName="code" {...{
            "className": "token keyword"
          }}>{`for`}</span>{` i`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{` data `}<span parentName="code" {...{
            "className": "token keyword"
          }}>{`in`}</span>{` `}<span parentName="code" {...{
            "className": "token builtin"
          }}>{`enumerate`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`training_loader`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`:`}</span>{`
  `}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# data and labels for batch`}</span>{`
  x`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{` y_true `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{` data
  `}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# our forward pass`}</span>{`
  y_hat `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{` model`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`x`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span>{`
  `}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# calculate our loss`}</span>{`
  current_loss `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{` criterion`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`y_hat`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{` y_true`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span>{`
  `}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# running average of our loss (for plotting, tracking, etc.)`}</span>{`
  loss `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{` `}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`loss `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`+`}</span>{` current_loss`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span>{` `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`/`}</span>{` batches_processed
  `}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# calculate (and accumulate gradient)`}</span>{`
  loss`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`backward`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span>{`
  `}<span parentName="code" {...{
            "className": "token keyword"
          }}>{`if`}</span>{` `}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`batches_processed`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span>{` `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`%`}</span>{` accumulate_for `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`==`}</span>{` `}<span parentName="code" {...{
            "className": "token number"
          }}>{`0`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`:`}</span>{`
    optimizer`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`step`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span>{`
    `}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# why not optimizer.zero_grad()? `}</span>{`
    `}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# sometimes they're equivalent (sometimes not).`}</span>{`
    `}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# see https://stackoverflow.com/a/61901561`}</span>{`
    model`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`zero_grad`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span>{`
    `}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# reset our loss and batch tally`}</span>{`
    loss `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{` `}<span parentName="code" {...{
            "className": "token number"
          }}>{`0`}</span>{`
    batches_processed `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{` `}<span parentName="code" {...{
            "className": "token number"
          }}>{`0`}</span></code></pre></div>
    <h4 {...{
      "id": "layer-normalization",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h4" {...{
        "href": "#layer-normalization",
        "aria-label": "layer normalization permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Layer normalization`}</h4>
    <p>{`Normalizing the activations of each layer can help to improve learning speed and stability.`}</p>
    <div {...{
      "className": "gatsby-highlight",
      "data-language": "python"
    }}><pre parentName="div" {...{
        "className": "language-python"
      }}><code parentName="pre" {...{
          "className": "language-python"
        }}><span parentName="code" {...{
            "className": "token keyword"
          }}>{`class`}</span>{` `}<span parentName="code" {...{
            "className": "token class-name"
          }}>{`MyAwesomeNetwork`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`nn`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`Module`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`:`}</span>{`

  `}<span parentName="code" {...{
            "className": "token keyword"
          }}>{`def`}</span>{` `}<span parentName="code" {...{
            "className": "token function"
          }}>{`__init__`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`self`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{` input_size`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`:`}</span>{` `}<span parentName="code" {...{
            "className": "token builtin"
          }}>{`int`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{` hidden_size`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`:`}</span>{` `}<span parentName="code" {...{
            "className": "token builtin"
          }}>{`int`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`:`}</span>{`
    `}<span parentName="code" {...{
            "className": "token builtin"
          }}>{`super`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`MyAwesomeNetwork`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{` self`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`__init__`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span>{`

    self`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`architecture `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{` nn`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`Sequential`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`
      nn`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`Linear`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`input_size`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{` hidden_size`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{`
      `}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# normalize activations after affine transformation`}</span>{`
      nn`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`LayerNorm`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`hidden_size`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{`
      `}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# non-saturating activation function`}</span>{`
      nn`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`ReLU`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{`
      `}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# rinse and repeat for later layers`}</span>{`
      nn`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`Linear`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`hidden_size`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{` hidden_size`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{`
      `}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# normalize activations after affine transformation`}</span>{`
      nn`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`LayerNorm`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`hidden_size`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{`
      `}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# classification layer `}</span>{`
      `}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# (assuming a binary classification problem)`}</span>{`
      nn`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`Sigmoid`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{`
    `}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span>{`
  `}<span parentName="code" {...{
            "className": "token keyword"
          }}>{`def`}</span>{` `}<span parentName="code" {...{
            "className": "token function"
          }}>{`forward`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`self`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{` x`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`:`}</span>{` torch`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`Tensor`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span>{` `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`-`}</span><span parentName="code" {...{
            "className": "token operator"
          }}>{`>`}</span>{` torch`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`Tensor`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`:`}</span>{`
    `}<span parentName="code" {...{
            "className": "token keyword"
          }}>{`return`}</span>{` self`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`architecture`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`x`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span></code></pre></div>
    <h4 {...{
      "id": "l2-regularization",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h4" {...{
        "href": "#l2-regularization",
        "aria-label": "l2 regularization permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`L2 regularization`}</h4>
    <p>{`We can add a penalty term (also called a regularizer) to the loss to encourage the model to limit the magnitude of weights:`}</p>
    <p><span parentName="p" {...{
        "className": "math math-inline"
      }}><span parentName="span" {...{
          "className": "katex"
        }}><span parentName="span" {...{
            "className": "katex-mathml"
          }}><math parentName="span" {...{
              "xmlns": "http://www.w3.org/1998/Math/MathML"
            }}><semantics parentName="math"><mrow parentName="semantics"><mi parentName="mrow">{`L`}</mi><mo parentName="mrow" {...{
                    "stretchy": "false"
                  }}>{`(`}</mo><mi parentName="mrow">{`f`}</mi><mo parentName="mrow" {...{
                    "stretchy": "false"
                  }}>{`(`}</mo><mi parentName="mrow">{`x`}</mi><mo parentName="mrow" {...{
                    "separator": "true"
                  }}>{`;`}</mo><mi parentName="mrow">{`θ`}</mi><mo parentName="mrow" {...{
                    "stretchy": "false"
                  }}>{`)`}</mo><mo parentName="mrow" {...{
                    "separator": "true"
                  }}>{`,`}</mo><mi parentName="mrow">{`y`}</mi><mo parentName="mrow" {...{
                    "stretchy": "false"
                  }}>{`)`}</mo><mo parentName="mrow">{`+`}</mo><mi parentName="mrow">{`λ`}</mi><msub parentName="mrow"><mi parentName="msub" {...{
                      "mathvariant": "normal"
                    }}>{`Σ`}</mi><mi parentName="msub">{`i`}</mi></msub><msubsup parentName="mrow"><mi parentName="msubsup">{`w`}</mi><mi parentName="msubsup">{`i`}</mi><mn parentName="msubsup">{`2`}</mn></msubsup></mrow><annotation parentName="semantics" {...{
                  "encoding": "application/x-tex"
                }}>{`L(f(x; \\theta), y) + \\lambda\\Sigma_i w_{i}^{2}`}</annotation></semantics></math></span><span parentName="span" {...{
            "className": "katex-html",
            "aria-hidden": "true"
          }}><span parentName="span" {...{
              "className": "base"
            }}><span parentName="span" {...{
                "className": "strut",
                "style": {
                  "height": "1em",
                  "verticalAlign": "-0.25em"
                }
              }}></span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`L`}</span><span parentName="span" {...{
                "className": "mopen"
              }}>{`(`}</span><span parentName="span" {...{
                "className": "mord mathnormal",
                "style": {
                  "marginRight": "0.10764em"
                }
              }}>{`f`}</span><span parentName="span" {...{
                "className": "mopen"
              }}>{`(`}</span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`x`}</span><span parentName="span" {...{
                "className": "mpunct"
              }}>{`;`}</span><span parentName="span" {...{
                "className": "mspace",
                "style": {
                  "marginRight": "0.1667em"
                }
              }}></span><span parentName="span" {...{
                "className": "mord mathnormal",
                "style": {
                  "marginRight": "0.02778em"
                }
              }}>{`θ`}</span><span parentName="span" {...{
                "className": "mclose"
              }}>{`)`}</span><span parentName="span" {...{
                "className": "mpunct"
              }}>{`,`}</span><span parentName="span" {...{
                "className": "mspace",
                "style": {
                  "marginRight": "0.1667em"
                }
              }}></span><span parentName="span" {...{
                "className": "mord mathnormal",
                "style": {
                  "marginRight": "0.03588em"
                }
              }}>{`y`}</span><span parentName="span" {...{
                "className": "mclose"
              }}>{`)`}</span><span parentName="span" {...{
                "className": "mspace",
                "style": {
                  "marginRight": "0.2222em"
                }
              }}></span><span parentName="span" {...{
                "className": "mbin"
              }}>{`+`}</span><span parentName="span" {...{
                "className": "mspace",
                "style": {
                  "marginRight": "0.2222em"
                }
              }}></span></span><span parentName="span" {...{
              "className": "base"
            }}><span parentName="span" {...{
                "className": "strut",
                "style": {
                  "height": "1.0728em",
                  "verticalAlign": "-0.2587em"
                }
              }}></span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`λ`}</span><span parentName="span" {...{
                "className": "mord"
              }}><span parentName="span" {...{
                  "className": "mord"
                }}>{`Σ`}</span><span parentName="span" {...{
                  "className": "msupsub"
                }}><span parentName="span" {...{
                    "className": "vlist-t vlist-t2"
                  }}><span parentName="span" {...{
                      "className": "vlist-r"
                    }}><span parentName="span" {...{
                        "className": "vlist",
                        "style": {
                          "height": "0.3117em"
                        }
                      }}><span parentName="span" {...{
                          "style": {
                            "top": "-2.55em",
                            "marginLeft": "0em",
                            "marginRight": "0.05em"
                          }
                        }}><span parentName="span" {...{
                            "className": "pstrut",
                            "style": {
                              "height": "2.7em"
                            }
                          }}></span><span parentName="span" {...{
                            "className": "sizing reset-size6 size3 mtight"
                          }}><span parentName="span" {...{
                              "className": "mord mathnormal mtight"
                            }}>{`i`}</span></span></span></span><span parentName="span" {...{
                        "className": "vlist-s"
                      }}>{`​`}</span></span><span parentName="span" {...{
                      "className": "vlist-r"
                    }}><span parentName="span" {...{
                        "className": "vlist",
                        "style": {
                          "height": "0.15em"
                        }
                      }}><span parentName="span"></span></span></span></span></span></span><span parentName="span" {...{
                "className": "mord"
              }}><span parentName="span" {...{
                  "className": "mord mathnormal",
                  "style": {
                    "marginRight": "0.02691em"
                  }
                }}>{`w`}</span><span parentName="span" {...{
                  "className": "msupsub"
                }}><span parentName="span" {...{
                    "className": "vlist-t vlist-t2"
                  }}><span parentName="span" {...{
                      "className": "vlist-r"
                    }}><span parentName="span" {...{
                        "className": "vlist",
                        "style": {
                          "height": "0.8141em"
                        }
                      }}><span parentName="span" {...{
                          "style": {
                            "top": "-2.4413em",
                            "marginLeft": "-0.0269em",
                            "marginRight": "0.05em"
                          }
                        }}><span parentName="span" {...{
                            "className": "pstrut",
                            "style": {
                              "height": "2.7em"
                            }
                          }}></span><span parentName="span" {...{
                            "className": "sizing reset-size6 size3 mtight"
                          }}><span parentName="span" {...{
                              "className": "mord mtight"
                            }}><span parentName="span" {...{
                                "className": "mord mathnormal mtight"
                              }}>{`i`}</span></span></span></span><span parentName="span" {...{
                          "style": {
                            "top": "-3.063em",
                            "marginRight": "0.05em"
                          }
                        }}><span parentName="span" {...{
                            "className": "pstrut",
                            "style": {
                              "height": "2.7em"
                            }
                          }}></span><span parentName="span" {...{
                            "className": "sizing reset-size6 size3 mtight"
                          }}><span parentName="span" {...{
                              "className": "mord mtight"
                            }}><span parentName="span" {...{
                                "className": "mord mtight"
                              }}>{`2`}</span></span></span></span></span><span parentName="span" {...{
                        "className": "vlist-s"
                      }}>{`​`}</span></span><span parentName="span" {...{
                      "className": "vlist-r"
                    }}><span parentName="span" {...{
                        "className": "vlist",
                        "style": {
                          "height": "0.2587em"
                        }
                      }}><span parentName="span"></span></span></span></span></span></span></span></span></span></span></p>
    <h2 {...{
      "id": "vanishing-gradients",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h2" {...{
        "href": "#vanishing-gradients",
        "aria-label": "vanishing gradients permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Vanishing gradients`}</h2>
    <p>{`When applying backpropagation to deep neural networks, the gradient generally decreases as we approach the input.  In some cases, the gradient may approach zero for the parameters of early layers.  Recall that when the gradient is zero, we can no longer update our parameters using gradient descent.`}</p>
    <h3 {...{
      "id": "signs-of-vanishing-gradients",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h3" {...{
        "href": "#signs-of-vanishing-gradients",
        "aria-label": "signs of vanishing gradients permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Signs of vanishing gradients`}</h3>
    <ul>
      <li parentName="ul">{`the model trains very slowly (the loss stops decreasing or slows to a crawl)`}</li>
      <li parentName="ul">{`the gradient for early layers is zero while remaining large for late layers`}</li>
      <li parentName="ul">{`weights for a layer are all zero`}</li>
    </ul>
    <h3 {...{
      "id": "causes-of-vanishing-gradients",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h3" {...{
        "href": "#causes-of-vanishing-gradients",
        "aria-label": "causes of vanishing gradients permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Causes of vanishing gradients`}</h3>
    <p>{`We've spent quite a bit of time discussing the sigmoid activation function. The sigmoid takes some input and produces a value between 0 and 1 `}<strong parentName="p">{`no matter the magnitude of that input`}</strong>{`.  When a neuron outputs a value close to either end of some bounded range, it is described as `}<strong parentName="p">{`saturated`}</strong>{`.  Activation functions that exhibit this property are sometimes described as "saturating non-linearities".  Using saturating activation functions in early layers of a deep neural network can lead to vanishing gradients by dampening the signal we rely on for backpropagation.  Luckily, there are many alternative activation functions that do not have this problem.`}</p>
    <h3 {...{
      "id": "avoiding-vanishing-gradients",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h3" {...{
        "href": "#avoiding-vanishing-gradients",
        "aria-label": "avoiding vanishing gradients permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Avoiding vanishing gradients`}</h3>
    <h4 {...{
      "id": "non-saturating-activation-functions",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h4" {...{
        "href": "#non-saturating-activation-functions",
        "aria-label": "non saturating activation functions permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Non-saturating activation functions`}</h4>
    <p>{`One of the most commonly used activation functions in deep learning architectures for NLP is the `}<strong parentName="p">{`rectified linear unit (ReLU)`}</strong>{`, which constrains only the lower bound of its output to be zero:`}</p>
    <p><span parentName="p" {...{
        "className": "math math-inline"
      }}><span parentName="span" {...{
          "className": "katex"
        }}><span parentName="span" {...{
            "className": "katex-mathml"
          }}><math parentName="span" {...{
              "xmlns": "http://www.w3.org/1998/Math/MathML"
            }}><semantics parentName="math"><mrow parentName="semantics"><mtext parentName="mrow">{`ReLU`}</mtext><mo parentName="mrow" {...{
                    "stretchy": "false"
                  }}>{`(`}</mo><mi parentName="mrow">{`z`}</mi><mo parentName="mrow" {...{
                    "stretchy": "false"
                  }}>{`)`}</mo><mo parentName="mrow">{`=`}</mo><mtext parentName="mrow">{`max`}</mtext><mo parentName="mrow" {...{
                    "stretchy": "false"
                  }}>{`(`}</mo><mn parentName="mrow">{`0`}</mn><mo parentName="mrow" {...{
                    "separator": "true"
                  }}>{`,`}</mo><mi parentName="mrow">{`z`}</mi><mo parentName="mrow" {...{
                    "stretchy": "false"
                  }}>{`)`}</mo></mrow><annotation parentName="semantics" {...{
                  "encoding": "application/x-tex"
                }}>{`\\text{ReLU}(z) = \\text{max}(0, z)`}</annotation></semantics></math></span><span parentName="span" {...{
            "className": "katex-html",
            "aria-hidden": "true"
          }}><span parentName="span" {...{
              "className": "base"
            }}><span parentName="span" {...{
                "className": "strut",
                "style": {
                  "height": "1em",
                  "verticalAlign": "-0.25em"
                }
              }}></span><span parentName="span" {...{
                "className": "mord text"
              }}><span parentName="span" {...{
                  "className": "mord"
                }}>{`ReLU`}</span></span><span parentName="span" {...{
                "className": "mopen"
              }}>{`(`}</span><span parentName="span" {...{
                "className": "mord mathnormal",
                "style": {
                  "marginRight": "0.04398em"
                }
              }}>{`z`}</span><span parentName="span" {...{
                "className": "mclose"
              }}>{`)`}</span><span parentName="span" {...{
                "className": "mspace",
                "style": {
                  "marginRight": "0.2778em"
                }
              }}></span><span parentName="span" {...{
                "className": "mrel"
              }}>{`=`}</span><span parentName="span" {...{
                "className": "mspace",
                "style": {
                  "marginRight": "0.2778em"
                }
              }}></span></span><span parentName="span" {...{
              "className": "base"
            }}><span parentName="span" {...{
                "className": "strut",
                "style": {
                  "height": "1em",
                  "verticalAlign": "-0.25em"
                }
              }}></span><span parentName="span" {...{
                "className": "mord text"
              }}><span parentName="span" {...{
                  "className": "mord"
                }}>{`max`}</span></span><span parentName="span" {...{
                "className": "mopen"
              }}>{`(`}</span><span parentName="span" {...{
                "className": "mord"
              }}>{`0`}</span><span parentName="span" {...{
                "className": "mpunct"
              }}>{`,`}</span><span parentName="span" {...{
                "className": "mspace",
                "style": {
                  "marginRight": "0.1667em"
                }
              }}></span><span parentName="span" {...{
                "className": "mord mathnormal",
                "style": {
                  "marginRight": "0.04398em"
                }
              }}>{`z`}</span><span parentName="span" {...{
                "className": "mclose"
              }}>{`)`}</span></span></span></span></span></p>
    <div {...{
      "className": "admonition admonition-warning alert alert--danger"
    }}><div parentName="div" {...{
        "className": "admonition-heading"
      }}><h5 parentName="div"><span parentName="h5" {...{
            "className": "admonition-icon"
          }}><svg parentName="span" {...{
              "xmlns": "http://www.w3.org/2000/svg",
              "width": "12",
              "height": "16",
              "viewBox": "0 0 12 16"
            }}><path parentName="svg" {...{
                "fillRule": "evenodd",
                "d": "M5.05.31c.81 2.17.41 3.38-.52 4.31C3.55 5.67 1.98 6.45.9 7.98c-1.45 2.05-1.7 6.53 3.53 7.7-2.2-1.16-2.67-4.52-.3-6.61-.61 2.03.53 3.33 1.94 2.86 1.39-.47 2.3.53 2.27 1.67-.02.78-.31 1.44-1.13 1.81 3.42-.59 4.78-3.42 4.78-5.56 0-2.84-2.53-3.22-1.25-5.61-1.52.13-2.03 1.13-1.89 2.75.09 1.08-1.02 1.8-1.86 1.33-.67-.41-.66-1.19-.06-1.78C8.18 5.31 8.68 2.45 5.05.32L5.03.3l.02.01z"
              }}></path></svg></span><undefined parentName="h5">{`dead neurons `}<span {...{
              "role": "img",
              "aria-label": "skull"
            }}>{`💀`}</span></undefined></h5></div><div parentName="div" {...{
        "className": "admonition-content"
      }}><p parentName="div">{`ReLUs cannot learn from negative values.  In such cases, they will only output zero. One variant of the ReLU designed to overcome this problem is the `}<strong parentName="p">{`leaky ReLU`}</strong><undefined parentName="p">{` `}<span {...{
              "role": "img",
              "aria-label": "sweat droplets"
            }}>{`💦`}</span>{`:`}</undefined></p><p parentName="div">{`f(x) `}<span parentName="p" {...{
            "className": "math math-inline"
          }}><span parentName="span" {...{
              "className": "katex"
            }}><span parentName="span" {...{
                "className": "katex-mathml"
              }}><math parentName="span" {...{
                  "xmlns": "http://www.w3.org/1998/Math/MathML"
                }}><semantics parentName="math"><mrow parentName="semantics"><mo parentName="mrow" {...{
                        "fence": "true"
                      }}>{`{`}</mo><mtable parentName="mrow" {...{
                        "rowspacing": "0.36em",
                        "columnalign": "left left",
                        "columnspacing": "1em"
                      }}><mtr parentName="mtable"><mtd parentName="mtr"><mstyle parentName="mtd" {...{
                              "scriptlevel": "0",
                              "displaystyle": "false"
                            }}><mi parentName="mstyle">{`x`}</mi></mstyle></mtd><mtd parentName="mtr"><mstyle parentName="mtd" {...{
                              "scriptlevel": "0",
                              "displaystyle": "false"
                            }}><mrow parentName="mstyle"><mtext parentName="mrow">{`if `}</mtext><mi parentName="mrow">{`x`}</mi><mo parentName="mrow">{`>`}</mo><mn parentName="mrow">{`0`}</mn></mrow></mstyle></mtd></mtr><mtr parentName="mtable"><mtd parentName="mtr"><mstyle parentName="mtd" {...{
                              "scriptlevel": "0",
                              "displaystyle": "false"
                            }}><mrow parentName="mstyle"><mi parentName="mrow">{`γ`}</mi><mi parentName="mrow">{`x`}</mi></mrow></mstyle></mtd><mtd parentName="mtr"><mstyle parentName="mtd" {...{
                              "scriptlevel": "0",
                              "displaystyle": "false"
                            }}><mtext parentName="mstyle">{`otherwise `}</mtext></mstyle></mtd></mtr></mtable></mrow><annotation parentName="semantics" {...{
                      "encoding": "application/x-tex"
                    }}>{`\\begin{cases}
   x &\\text{if } x > 0 \\\\
   \\gamma x &\\text{otherwise }
\\end{cases}`}</annotation></semantics></math></span><span parentName="span" {...{
                "className": "katex-html",
                "aria-hidden": "true"
              }}><span parentName="span" {...{
                  "className": "base"
                }}><span parentName="span" {...{
                    "className": "strut",
                    "style": {
                      "height": "3em",
                      "verticalAlign": "-1.25em"
                    }
                  }}></span><span parentName="span" {...{
                    "className": "minner"
                  }}><span parentName="span" {...{
                      "className": "mopen delimcenter",
                      "style": {
                        "top": "0em"
                      }
                    }}><span parentName="span" {...{
                        "className": "delimsizing size4"
                      }}>{`{`}</span></span><span parentName="span" {...{
                      "className": "mord"
                    }}><span parentName="span" {...{
                        "className": "mtable"
                      }}><span parentName="span" {...{
                          "className": "col-align-l"
                        }}><span parentName="span" {...{
                            "className": "vlist-t vlist-t2"
                          }}><span parentName="span" {...{
                              "className": "vlist-r"
                            }}><span parentName="span" {...{
                                "className": "vlist",
                                "style": {
                                  "height": "1.69em"
                                }
                              }}><span parentName="span" {...{
                                  "style": {
                                    "top": "-3.69em"
                                  }
                                }}><span parentName="span" {...{
                                    "className": "pstrut",
                                    "style": {
                                      "height": "3.008em"
                                    }
                                  }}></span><span parentName="span" {...{
                                    "className": "mord"
                                  }}><span parentName="span" {...{
                                      "className": "mord mathnormal"
                                    }}>{`x`}</span></span></span><span parentName="span" {...{
                                  "style": {
                                    "top": "-2.25em"
                                  }
                                }}><span parentName="span" {...{
                                    "className": "pstrut",
                                    "style": {
                                      "height": "3.008em"
                                    }
                                  }}></span><span parentName="span" {...{
                                    "className": "mord"
                                  }}><span parentName="span" {...{
                                      "className": "mord mathnormal",
                                      "style": {
                                        "marginRight": "0.05556em"
                                      }
                                    }}>{`γ`}</span><span parentName="span" {...{
                                      "className": "mord mathnormal"
                                    }}>{`x`}</span></span></span></span><span parentName="span" {...{
                                "className": "vlist-s"
                              }}>{`​`}</span></span><span parentName="span" {...{
                              "className": "vlist-r"
                            }}><span parentName="span" {...{
                                "className": "vlist",
                                "style": {
                                  "height": "1.19em"
                                }
                              }}><span parentName="span"></span></span></span></span></span><span parentName="span" {...{
                          "className": "arraycolsep",
                          "style": {
                            "width": "1em"
                          }
                        }}></span><span parentName="span" {...{
                          "className": "col-align-l"
                        }}><span parentName="span" {...{
                            "className": "vlist-t vlist-t2"
                          }}><span parentName="span" {...{
                              "className": "vlist-r"
                            }}><span parentName="span" {...{
                                "className": "vlist",
                                "style": {
                                  "height": "1.69em"
                                }
                              }}><span parentName="span" {...{
                                  "style": {
                                    "top": "-3.69em"
                                  }
                                }}><span parentName="span" {...{
                                    "className": "pstrut",
                                    "style": {
                                      "height": "3.008em"
                                    }
                                  }}></span><span parentName="span" {...{
                                    "className": "mord"
                                  }}><span parentName="span" {...{
                                      "className": "mord text"
                                    }}><span parentName="span" {...{
                                        "className": "mord"
                                      }}>{`if `}</span></span><span parentName="span" {...{
                                      "className": "mord mathnormal"
                                    }}>{`x`}</span><span parentName="span" {...{
                                      "className": "mspace",
                                      "style": {
                                        "marginRight": "0.2778em"
                                      }
                                    }}></span><span parentName="span" {...{
                                      "className": "mrel"
                                    }}>{`>`}</span><span parentName="span" {...{
                                      "className": "mspace",
                                      "style": {
                                        "marginRight": "0.2778em"
                                      }
                                    }}></span><span parentName="span" {...{
                                      "className": "mord"
                                    }}>{`0`}</span></span></span><span parentName="span" {...{
                                  "style": {
                                    "top": "-2.25em"
                                  }
                                }}><span parentName="span" {...{
                                    "className": "pstrut",
                                    "style": {
                                      "height": "3.008em"
                                    }
                                  }}></span><span parentName="span" {...{
                                    "className": "mord"
                                  }}><span parentName="span" {...{
                                      "className": "mord text"
                                    }}><span parentName="span" {...{
                                        "className": "mord"
                                      }}>{`otherwise `}</span></span></span></span></span><span parentName="span" {...{
                                "className": "vlist-s"
                              }}>{`​`}</span></span><span parentName="span" {...{
                              "className": "vlist-r"
                            }}><span parentName="span" {...{
                                "className": "vlist",
                                "style": {
                                  "height": "1.19em"
                                }
                              }}><span parentName="span"></span></span></span></span></span></span></span><span parentName="span" {...{
                      "className": "mclose nulldelimiter"
                    }}></span></span></span></span></span></span></p><p parentName="div">{`where `}<span parentName="p" {...{
            "className": "math math-inline"
          }}><span parentName="span" {...{
              "className": "katex"
            }}><span parentName="span" {...{
                "className": "katex-mathml"
              }}><math parentName="span" {...{
                  "xmlns": "http://www.w3.org/1998/Math/MathML"
                }}><semantics parentName="math"><mrow parentName="semantics"><mi parentName="mrow">{`γ`}</mi></mrow><annotation parentName="semantics" {...{
                      "encoding": "application/x-tex"
                    }}>{`\\gamma`}</annotation></semantics></math></span><span parentName="span" {...{
                "className": "katex-html",
                "aria-hidden": "true"
              }}><span parentName="span" {...{
                  "className": "base"
                }}><span parentName="span" {...{
                    "className": "strut",
                    "style": {
                      "height": "0.625em",
                      "verticalAlign": "-0.1944em"
                    }
                  }}></span><span parentName="span" {...{
                    "className": "mord mathnormal",
                    "style": {
                      "marginRight": "0.05556em"
                    }
                  }}>{`γ`}</span></span></span></span></span>{` is some small value (ex. 0.001).`}</p></div></div>
    <h4 {...{
      "id": "shallower-networks",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h4" {...{
        "href": "#shallower-networks",
        "aria-label": "shallower networks permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Shallower networks`}</h4>
    <p>{`Reducing the number of layers may also help to avoid vanishing gradients.`}</p>
    <h2 {...{
      "id": "exploding-gradients",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h2" {...{
        "href": "#exploding-gradients",
        "aria-label": "exploding gradients permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Exploding gradients`}</h2>
    <p>{`In certain architectures and problems, one can encounter very large gradients where values approach positive infinity (ex. multiplying a series of gradients with values larger than 1).  This results in large updates to parameters and model instability.`}</p>
    <h3 {...{
      "id": "signs-of-exploding-gradients",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h3" {...{
        "href": "#signs-of-exploding-gradients",
        "aria-label": "signs of exploding gradients permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Signs of exploding gradients`}</h3>
    <ul>
      <li parentName="ul">{`parameters grow exponentially`}</li>
      <li parentName="ul">{`parameters become NaN (approach positive or negative infinity)`}</li>
      <li parentName="ul">{`loss becomes NaN (approaches positive or negative infinity)`}</li>
      <li parentName="ul">{`large swings in loss (instability)`}</li>
    </ul>
    <h3 {...{
      "id": "avoiding-exploding-gradients",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h3" {...{
        "href": "#avoiding-exploding-gradients",
        "aria-label": "avoiding exploding gradients permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Avoiding exploding gradients`}</h3>
    <p>{`The most effective means of addressing exploding gradients is `}<strong parentName="p">{`gradient clipping`}</strong>{`.`}</p>
    <h4 {...{
      "id": "gradient-clipping",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h4" {...{
        "href": "#gradient-clipping",
        "aria-label": "gradient clipping permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Gradient clipping`}</h4>
    <p>{`"Clip" (or alternatively scale using `}<strong parentName="p">{`gradient scaling`}</strong>{`) the gradients `}<span parentName="p" {...{
        "className": "math math-inline"
      }}><span parentName="span" {...{
          "className": "katex"
        }}><span parentName="span" {...{
            "className": "katex-mathml"
          }}><math parentName="span" {...{
              "xmlns": "http://www.w3.org/1998/Math/MathML"
            }}><semantics parentName="math"><mrow parentName="semantics"><mi parentName="mrow">{`g`}</mi></mrow><annotation parentName="semantics" {...{
                  "encoding": "application/x-tex"
                }}>{`g`}</annotation></semantics></math></span><span parentName="span" {...{
            "className": "katex-html",
            "aria-hidden": "true"
          }}><span parentName="span" {...{
              "className": "base"
            }}><span parentName="span" {...{
                "className": "strut",
                "style": {
                  "height": "0.625em",
                  "verticalAlign": "-0.1944em"
                }
              }}></span><span parentName="span" {...{
                "className": "mord mathnormal",
                "style": {
                  "marginRight": "0.03588em"
                }
              }}>{`g`}</span></span></span></span></span>{` whenever they exceed some threshold (`}<span parentName="p" {...{
        "className": "math math-inline"
      }}><span parentName="span" {...{
          "className": "katex"
        }}><span parentName="span" {...{
            "className": "katex-mathml"
          }}><math parentName="span" {...{
              "xmlns": "http://www.w3.org/1998/Math/MathML"
            }}><semantics parentName="math"><mrow parentName="semantics"><mi parentName="mrow">{`γ`}</mi></mrow><annotation parentName="semantics" {...{
                  "encoding": "application/x-tex"
                }}>{`\\gamma`}</annotation></semantics></math></span><span parentName="span" {...{
            "className": "katex-html",
            "aria-hidden": "true"
          }}><span parentName="span" {...{
              "className": "base"
            }}><span parentName="span" {...{
                "className": "strut",
                "style": {
                  "height": "0.625em",
                  "verticalAlign": "-0.1944em"
                }
              }}></span><span parentName="span" {...{
                "className": "mord mathnormal",
                "style": {
                  "marginRight": "0.05556em"
                }
              }}>{`γ`}</span></span></span></span></span>{`). One typical version of this is to clip by the 2-norm of the gradient:`}</p>
    <div {...{
      "className": "math math-display"
    }}><span parentName="div" {...{
        "className": "katex-display"
      }}><span parentName="span" {...{
          "className": "katex"
        }}><span parentName="span" {...{
            "className": "katex-mathml"
          }}><math parentName="span" {...{
              "xmlns": "http://www.w3.org/1998/Math/MathML",
              "display": "block"
            }}><semantics parentName="math"><mrow parentName="semantics"><mo parentName="mrow" {...{
                    "fence": "true"
                  }}>{`{`}</mo><mtable parentName="mrow" {...{
                    "rowspacing": "0.36em",
                    "columnalign": "left left",
                    "columnspacing": "1em"
                  }}><mtr parentName="mtable"><mtd parentName="mtr"><mstyle parentName="mtd" {...{
                          "scriptlevel": "0",
                          "displaystyle": "false"
                        }}><mrow parentName="mstyle"><mi parentName="mrow">{`γ`}</mi><mfrac parentName="mrow"><mi parentName="mfrac">{`g`}</mi><mrow parentName="mfrac"><mi parentName="mrow" {...{
                                  "mathvariant": "normal"
                                }}>{`∣`}</mi><mi parentName="mrow" {...{
                                  "mathvariant": "normal"
                                }}>{`∣`}</mi><mi parentName="mrow">{`g`}</mi><mi parentName="mrow" {...{
                                  "mathvariant": "normal"
                                }}>{`∣`}</mi><mi parentName="mrow" {...{
                                  "mathvariant": "normal"
                                }}>{`∣`}</mi></mrow></mfrac></mrow></mstyle></mtd><mtd parentName="mtr"><mstyle parentName="mtd" {...{
                          "scriptlevel": "0",
                          "displaystyle": "false"
                        }}><mrow parentName="mstyle"><mtext parentName="mrow">{`if `}</mtext><mi parentName="mrow">{`g`}</mi><mo parentName="mrow">{`>`}</mo><mi parentName="mrow">{`γ`}</mi></mrow></mstyle></mtd></mtr><mtr parentName="mtable"><mtd parentName="mtr"><mstyle parentName="mtd" {...{
                          "scriptlevel": "0",
                          "displaystyle": "false"
                        }}><mi parentName="mstyle">{`g`}</mi></mstyle></mtd><mtd parentName="mtr"><mstyle parentName="mtd" {...{
                          "scriptlevel": "0",
                          "displaystyle": "false"
                        }}><mtext parentName="mstyle">{`otherwise `}</mtext></mstyle></mtd></mtr></mtable></mrow><annotation parentName="semantics" {...{
                  "encoding": "application/x-tex"
                }}>{`\\begin{cases}
   \\gamma\\frac{g}{\\vert \\vert g \\vert \\vert} &\\text{if } g > \\gamma \\\\
   g &\\text{otherwise }
\\end{cases}`}</annotation></semantics></math></span><span parentName="span" {...{
            "className": "katex-html",
            "aria-hidden": "true"
          }}><span parentName="span" {...{
              "className": "base"
            }}><span parentName="span" {...{
                "className": "strut",
                "style": {
                  "height": "3em",
                  "verticalAlign": "-1.25em"
                }
              }}></span><span parentName="span" {...{
                "className": "minner"
              }}><span parentName="span" {...{
                  "className": "mopen delimcenter",
                  "style": {
                    "top": "0em"
                  }
                }}><span parentName="span" {...{
                    "className": "delimsizing size4"
                  }}>{`{`}</span></span><span parentName="span" {...{
                  "className": "mord"
                }}><span parentName="span" {...{
                    "className": "mtable"
                  }}><span parentName="span" {...{
                      "className": "col-align-l"
                    }}><span parentName="span" {...{
                        "className": "vlist-t vlist-t2"
                      }}><span parentName="span" {...{
                          "className": "vlist-r"
                        }}><span parentName="span" {...{
                            "className": "vlist",
                            "style": {
                              "height": "1.734em"
                            }
                          }}><span parentName="span" {...{
                              "style": {
                                "top": "-3.734em"
                              }
                            }}><span parentName="span" {...{
                                "className": "pstrut",
                                "style": {
                                  "height": "3.008em"
                                }
                              }}></span><span parentName="span" {...{
                                "className": "mord"
                              }}><span parentName="span" {...{
                                  "className": "mord mathnormal",
                                  "style": {
                                    "marginRight": "0.05556em"
                                  }
                                }}>{`γ`}</span><span parentName="span" {...{
                                  "className": "mord"
                                }}><span parentName="span" {...{
                                    "className": "mopen nulldelimiter"
                                  }}></span><span parentName="span" {...{
                                    "className": "mfrac"
                                  }}><span parentName="span" {...{
                                      "className": "vlist-t vlist-t2"
                                    }}><span parentName="span" {...{
                                        "className": "vlist-r"
                                      }}><span parentName="span" {...{
                                          "className": "vlist",
                                          "style": {
                                            "height": "0.7475em"
                                          }
                                        }}><span parentName="span" {...{
                                            "style": {
                                              "top": "-2.655em"
                                            }
                                          }}><span parentName="span" {...{
                                              "className": "pstrut",
                                              "style": {
                                                "height": "3em"
                                              }
                                            }}></span><span parentName="span" {...{
                                              "className": "sizing reset-size6 size3 mtight"
                                            }}><span parentName="span" {...{
                                                "className": "mord mtight"
                                              }}><span parentName="span" {...{
                                                  "className": "mord mtight"
                                                }}>{`∣∣`}</span><span parentName="span" {...{
                                                  "className": "mord mathnormal mtight",
                                                  "style": {
                                                    "marginRight": "0.03588em"
                                                  }
                                                }}>{`g`}</span><span parentName="span" {...{
                                                  "className": "mord mtight"
                                                }}>{`∣∣`}</span></span></span></span><span parentName="span" {...{
                                            "style": {
                                              "top": "-3.23em"
                                            }
                                          }}><span parentName="span" {...{
                                              "className": "pstrut",
                                              "style": {
                                                "height": "3em"
                                              }
                                            }}></span><span parentName="span" {...{
                                              "className": "frac-line",
                                              "style": {
                                                "borderBottomWidth": "0.04em"
                                              }
                                            }}></span></span><span parentName="span" {...{
                                            "style": {
                                              "top": "-3.4461em"
                                            }
                                          }}><span parentName="span" {...{
                                              "className": "pstrut",
                                              "style": {
                                                "height": "3em"
                                              }
                                            }}></span><span parentName="span" {...{
                                              "className": "sizing reset-size6 size3 mtight"
                                            }}><span parentName="span" {...{
                                                "className": "mord mtight"
                                              }}><span parentName="span" {...{
                                                  "className": "mord mathnormal mtight",
                                                  "style": {
                                                    "marginRight": "0.03588em"
                                                  }
                                                }}>{`g`}</span></span></span></span></span><span parentName="span" {...{
                                          "className": "vlist-s"
                                        }}>{`​`}</span></span><span parentName="span" {...{
                                        "className": "vlist-r"
                                      }}><span parentName="span" {...{
                                          "className": "vlist",
                                          "style": {
                                            "height": "0.52em"
                                          }
                                        }}><span parentName="span"></span></span></span></span></span><span parentName="span" {...{
                                    "className": "mclose nulldelimiter"
                                  }}></span></span></span></span><span parentName="span" {...{
                              "style": {
                                "top": "-2.206em"
                              }
                            }}><span parentName="span" {...{
                                "className": "pstrut",
                                "style": {
                                  "height": "3.008em"
                                }
                              }}></span><span parentName="span" {...{
                                "className": "mord"
                              }}><span parentName="span" {...{
                                  "className": "mord mathnormal",
                                  "style": {
                                    "marginRight": "0.03588em"
                                  }
                                }}>{`g`}</span></span></span></span><span parentName="span" {...{
                            "className": "vlist-s"
                          }}>{`​`}</span></span><span parentName="span" {...{
                          "className": "vlist-r"
                        }}><span parentName="span" {...{
                            "className": "vlist",
                            "style": {
                              "height": "1.234em"
                            }
                          }}><span parentName="span"></span></span></span></span></span><span parentName="span" {...{
                      "className": "arraycolsep",
                      "style": {
                        "width": "1em"
                      }
                    }}></span><span parentName="span" {...{
                      "className": "col-align-l"
                    }}><span parentName="span" {...{
                        "className": "vlist-t vlist-t2"
                      }}><span parentName="span" {...{
                          "className": "vlist-r"
                        }}><span parentName="span" {...{
                            "className": "vlist",
                            "style": {
                              "height": "1.734em"
                            }
                          }}><span parentName="span" {...{
                              "style": {
                                "top": "-3.734em"
                              }
                            }}><span parentName="span" {...{
                                "className": "pstrut",
                                "style": {
                                  "height": "3.008em"
                                }
                              }}></span><span parentName="span" {...{
                                "className": "mord"
                              }}><span parentName="span" {...{
                                  "className": "mord text"
                                }}><span parentName="span" {...{
                                    "className": "mord"
                                  }}>{`if `}</span></span><span parentName="span" {...{
                                  "className": "mord mathnormal",
                                  "style": {
                                    "marginRight": "0.03588em"
                                  }
                                }}>{`g`}</span><span parentName="span" {...{
                                  "className": "mspace",
                                  "style": {
                                    "marginRight": "0.2778em"
                                  }
                                }}></span><span parentName="span" {...{
                                  "className": "mrel"
                                }}>{`>`}</span><span parentName="span" {...{
                                  "className": "mspace",
                                  "style": {
                                    "marginRight": "0.2778em"
                                  }
                                }}></span><span parentName="span" {...{
                                  "className": "mord mathnormal",
                                  "style": {
                                    "marginRight": "0.05556em"
                                  }
                                }}>{`γ`}</span></span></span><span parentName="span" {...{
                              "style": {
                                "top": "-2.206em"
                              }
                            }}><span parentName="span" {...{
                                "className": "pstrut",
                                "style": {
                                  "height": "3.008em"
                                }
                              }}></span><span parentName="span" {...{
                                "className": "mord"
                              }}><span parentName="span" {...{
                                  "className": "mord text"
                                }}><span parentName="span" {...{
                                    "className": "mord"
                                  }}>{`otherwise `}</span></span></span></span></span><span parentName="span" {...{
                            "className": "vlist-s"
                          }}>{`​`}</span></span><span parentName="span" {...{
                          "className": "vlist-r"
                        }}><span parentName="span" {...{
                            "className": "vlist",
                            "style": {
                              "height": "1.234em"
                            }
                          }}><span parentName="span"></span></span></span></span></span></span></span><span parentName="span" {...{
                  "className": "mclose nulldelimiter"
                }}></span></span></span></span></span></span></div>
    <p>{`In PyTorch, we would use `}<a parentName="p" {...{
        "href": "https://pytorch.org/docs/stable/generated/torch.nn.utils.clip_grad_norm_.html",
        "target": "_self",
        "rel": "nofollow"
      }}><code parentName="a" {...{
          "className": "language-text"
        }}>{`torch.nn.utils.clip_grad_norm_`}</code></a>{`:`}</p>
    <div {...{
      "className": "gatsby-highlight",
      "data-language": "python"
    }}><pre parentName="div" {...{
        "className": "language-python"
      }}><code parentName="pre" {...{
          "className": "language-python"
        }}><span parentName="code" {...{
            "className": "token keyword"
          }}>{`import`}</span>{` torch

`}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# perform backward pass/calculate gradients`}</span>{`
loss`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`backward`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span>{`

threshold `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{` `}<span parentName="code" {...{
            "className": "token number"
          }}>{`1`}</span>{`
torch`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`nn`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`utils`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`clip_grad_norm_`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`
  parameters`}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{`model`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`parameters`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{` 
  max_norm`}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{`threshold`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{` 
  norm_type`}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span><span parentName="code" {...{
            "className": "token number"
          }}>{`2.`}</span>{`
`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span>{`
`}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# update parameters`}</span>{`
optimizer`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`step`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span></code></pre></div>
    <h1 {...{
      "id": "not-enough-data",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h1" {...{
        "href": "#not-enough-data",
        "aria-label": "not enough data permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Not enough data`}</h1>
    <p>{`If labeled data is in short supply and expensive to supplement, you may struggle to train a deep neural network with many parameters.  This can be mitigated through techniques like data augmentation (techniques for creating new datapoints from existing ones), multi-task learning (MTL), transfer learning, and fine-tuning.  We'll postpone in-depth discussion of most of these until later, but the key observation behind multi-task learning, transfer learning, and fine-tuning is that we can often repurpose a network trained for a different but related task by replacing the final layer (and potentially adding some layers before the new classification layer).  Why does this work? Deep neural networks are able to learn progressively more task-specific features from general ones. Early layers may contain information applicable to many different tasks.`}</p>
    <h3 {...{
      "id": "pre-trained-embeddings",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h3" {...{
        "href": "#pre-trained-embeddings",
        "aria-label": "pre trained embeddings permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Pre-trained embeddings`}</h3>
    <p>{`For neural network architectures using static word embeddings that are parameters of the model, we can initialize the value of those parameters using pre-trained word embeddings (e.g., GloVe, skip-gram, etc.):`}</p>
    <div {...{
      "className": "gatsby-highlight",
      "data-language": "python"
    }}><pre parentName="div" {...{
        "className": "language-python"
      }}><code parentName="pre" {...{
          "className": "language-python"
        }}><span parentName="code" {...{
            "className": "token keyword"
          }}>{`import`}</span>{` torch
`}<span parentName="code" {...{
            "className": "token keyword"
          }}>{`from`}</span>{` torch `}<span parentName="code" {...{
            "className": "token keyword"
          }}>{`import`}</span>{` nn

`}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# imagine these are loaded from numpy`}</span>{`
pretrained_embeddings`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`:`}</span>{` torch`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`FloatTensor `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{` torch`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`from_numpy`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`my_embeddings`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span>{`
`}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# see https://pytorch.org/docs/stable/generated/torch.nn.Embedding.html`}</span>{`
embeddings `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{` nn`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`Embedding`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`from_pretrained`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`
  `}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# imagine these are loaded from GloVe, etc.`}</span>{`
  pretrained_embeddings`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{` 
  `}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# allow these to be adjusted through further training`}</span>{`
  freeze`}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span><span parentName="code" {...{
            "className": "token boolean"
          }}>{`False`}</span>{`
`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span>{`

`}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# get the embedding for the word (or char embedding, pos tag embedding, etc) with index 1`}</span>{`
idx `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{` `}<span parentName="code" {...{
            "className": "token number"
          }}>{`1`}</span>{`
embedding_idx `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{` torch`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`LongTensor`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`[`}</span>{`idx`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`]`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span>{`
embeddings`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`embedding_idx`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span></code></pre></div>
    <p>{`This allows us to jump-start the learning process with a set of parameters that already encode useful information about language.`}</p>
    <p>{`Many algorithms exist for learning embeddings in a self-supervised way (cf. skip-gram, CBOW). `}</p>
    <h2 {...{
      "id": "tokenization-strategies",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h2" {...{
        "href": "#tokenization-strategies",
        "aria-label": "tokenization strategies permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Tokenization strategies`}</h2>
    <p>{`If we don't have enough data to learn good values for a deep neural network with a large number of parameters, we can try reducing the number of parameters.  Aside from earlier discussions about removing hidden layers and reducing their size, we can also think about reducing the vocabulary size for our input.  Rather than trying to learn a representation of every word in our vocabulary, we can learn representations for smaller units (sub-words), which will in turn reduce our vocabulary size (many words share sub-word sequences). On one extreme, we can tokenize into individual characters (or even bytes), but there are `}<a parentName="p" {...{
        "href": "https://github.com/google/sentencepiece",
        "target": "_self",
        "rel": "nofollow"
      }}>{`other tokenization strategies that afford a compromise`}</a>{`.`}</p>
    <h1 {...{
      "id": "other-tricks",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h1" {...{
        "href": "#other-tricks",
        "aria-label": "other tricks permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Other tricks`}</h1>
    <p>{`This list of tips and tricks is by no means comprehensive. Here are a few other tricks to improve learning with deep neural networks.`}</p>
    {
      /* ### Skip connections
      https://pytorch.org/docs/stable/pipeline.html#skip-connections */
    }
    {
      /* 
      ## Multi-task learning
      Imagine you are classifying the sentiment of book reviews. In this hypothetical scenario,  you have thousands of unlabeled book reviews, but only a few dozen labeled ones. In addition to this corpus, you have a list of negative sentiment words.
      We might first train a deep neural network to predict which words in some document are negative sentiment words.  Knowing which words are negative sentiment words might help us predict the sentiment of a book review.
      
      Multi-task learning (MTL) is a way of x.
      Share parameters across multiple tasks.
      Change the classification final layer(s) for each task.
      <!-- ## Parameter sharing */
    }
    {
      /* ## Transfer learning
      Just like with Multi-task learning, we can often re-use a network trained for one task on another task by replacing the final layer (and potentially adding).  If we think of a deep neural network as learning progressively more task-specific features from general ones, the early layers may contain information applicable to many different tasks.
      We'll discuss this idea in more depth when we explore transformers.
      ### Gradual unfreezing */
    }
    <h2 {...{
      "id": "optimizers",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h2" {...{
        "href": "#optimizers",
        "aria-label": "optimizers permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Optimizers`}</h2>
    <p><strong parentName="p">{`With the right tuning`}</strong>{` of hyperparameters, many optimizers will yield equivalent performance. Adam has consistently been shown to have excellent out-of-the-box performance for many NLP tasks.`}</p>
    <h2 {...{
      "id": "shuffle-every-epoch",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h2" {...{
        "href": "#shuffle-every-epoch",
        "aria-label": "shuffle every epoch permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Shuffle every epoch`}</h2>
    <p>{`Datapoints within a batch may have high variance/competing gradients.  To avoid seeing the same batches (subgroups of datapoints) in each epoch, shuffle your datapoints for each epoch.`}</p>
    <h2 {...{
      "id": "residual-connections",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h2" {...{
        "href": "#residual-connections",
        "aria-label": "residual connections permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Residual connections`}</h2>
    <p>{`For deep neural networks, we can improve the signal for backpropagation by adding `}<strong parentName="p">{`residual connections`}</strong>{` that directly link earlier layers with later layers: `}</p>
    <p>{`For input at layer `}<span parentName="p" {...{
        "className": "math math-inline"
      }}><span parentName="span" {...{
          "className": "katex"
        }}><span parentName="span" {...{
            "className": "katex-mathml"
          }}><math parentName="span" {...{
              "xmlns": "http://www.w3.org/1998/Math/MathML"
            }}><semantics parentName="math"><mrow parentName="semantics"><mi parentName="mrow">{`t`}</mi></mrow><annotation parentName="semantics" {...{
                  "encoding": "application/x-tex"
                }}>{`t`}</annotation></semantics></math></span><span parentName="span" {...{
            "className": "katex-html",
            "aria-hidden": "true"
          }}><span parentName="span" {...{
              "className": "base"
            }}><span parentName="span" {...{
                "className": "strut",
                "style": {
                  "height": "0.6151em"
                }
              }}></span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`t`}</span></span></span></span></span>{` (`}<span parentName="p" {...{
        "className": "math math-inline"
      }}><span parentName="span" {...{
          "className": "katex"
        }}><span parentName="span" {...{
            "className": "katex-mathml"
          }}><math parentName="span" {...{
              "xmlns": "http://www.w3.org/1998/Math/MathML"
            }}><semantics parentName="math"><mrow parentName="semantics"><msub parentName="mrow"><mi parentName="msub">{`x`}</mi><mi parentName="msub">{`t`}</mi></msub></mrow><annotation parentName="semantics" {...{
                  "encoding": "application/x-tex"
                }}>{`x_{t}`}</annotation></semantics></math></span><span parentName="span" {...{
            "className": "katex-html",
            "aria-hidden": "true"
          }}><span parentName="span" {...{
              "className": "base"
            }}><span parentName="span" {...{
                "className": "strut",
                "style": {
                  "height": "0.5806em",
                  "verticalAlign": "-0.15em"
                }
              }}></span><span parentName="span" {...{
                "className": "mord"
              }}><span parentName="span" {...{
                  "className": "mord mathnormal"
                }}>{`x`}</span><span parentName="span" {...{
                  "className": "msupsub"
                }}><span parentName="span" {...{
                    "className": "vlist-t vlist-t2"
                  }}><span parentName="span" {...{
                      "className": "vlist-r"
                    }}><span parentName="span" {...{
                        "className": "vlist",
                        "style": {
                          "height": "0.2806em"
                        }
                      }}><span parentName="span" {...{
                          "style": {
                            "top": "-2.55em",
                            "marginLeft": "0em",
                            "marginRight": "0.05em"
                          }
                        }}><span parentName="span" {...{
                            "className": "pstrut",
                            "style": {
                              "height": "2.7em"
                            }
                          }}></span><span parentName="span" {...{
                            "className": "sizing reset-size6 size3 mtight"
                          }}><span parentName="span" {...{
                              "className": "mord mtight"
                            }}><span parentName="span" {...{
                                "className": "mord mathnormal mtight"
                              }}>{`t`}</span></span></span></span></span><span parentName="span" {...{
                        "className": "vlist-s"
                      }}>{`​`}</span></span><span parentName="span" {...{
                      "className": "vlist-r"
                    }}><span parentName="span" {...{
                        "className": "vlist",
                        "style": {
                          "height": "0.15em"
                        }
                      }}><span parentName="span"></span></span></span></span></span></span></span></span></span></span>{`) we simply add the input for some earlier layer (`}<span parentName="p" {...{
        "className": "math math-inline"
      }}><span parentName="span" {...{
          "className": "katex"
        }}><span parentName="span" {...{
            "className": "katex-mathml"
          }}><math parentName="span" {...{
              "xmlns": "http://www.w3.org/1998/Math/MathML"
            }}><semantics parentName="math"><mrow parentName="semantics"><msub parentName="mrow"><mi parentName="msub">{`x`}</mi><mrow parentName="msub"><mi parentName="mrow">{`t`}</mi><mo parentName="mrow">{`−`}</mo><mi parentName="mrow">{`n`}</mi></mrow></msub></mrow><annotation parentName="semantics" {...{
                  "encoding": "application/x-tex"
                }}>{`x_{t-n}`}</annotation></semantics></math></span><span parentName="span" {...{
            "className": "katex-html",
            "aria-hidden": "true"
          }}><span parentName="span" {...{
              "className": "base"
            }}><span parentName="span" {...{
                "className": "strut",
                "style": {
                  "height": "0.6389em",
                  "verticalAlign": "-0.2083em"
                }
              }}></span><span parentName="span" {...{
                "className": "mord"
              }}><span parentName="span" {...{
                  "className": "mord mathnormal"
                }}>{`x`}</span><span parentName="span" {...{
                  "className": "msupsub"
                }}><span parentName="span" {...{
                    "className": "vlist-t vlist-t2"
                  }}><span parentName="span" {...{
                      "className": "vlist-r"
                    }}><span parentName="span" {...{
                        "className": "vlist",
                        "style": {
                          "height": "0.2806em"
                        }
                      }}><span parentName="span" {...{
                          "style": {
                            "top": "-2.55em",
                            "marginLeft": "0em",
                            "marginRight": "0.05em"
                          }
                        }}><span parentName="span" {...{
                            "className": "pstrut",
                            "style": {
                              "height": "2.7em"
                            }
                          }}></span><span parentName="span" {...{
                            "className": "sizing reset-size6 size3 mtight"
                          }}><span parentName="span" {...{
                              "className": "mord mtight"
                            }}><span parentName="span" {...{
                                "className": "mord mathnormal mtight"
                              }}>{`t`}</span><span parentName="span" {...{
                                "className": "mbin mtight"
                              }}>{`−`}</span><span parentName="span" {...{
                                "className": "mord mathnormal mtight"
                              }}>{`n`}</span></span></span></span></span><span parentName="span" {...{
                        "className": "vlist-s"
                      }}>{`​`}</span></span><span parentName="span" {...{
                      "className": "vlist-r"
                    }}><span parentName="span" {...{
                        "className": "vlist",
                        "style": {
                          "height": "0.2083em"
                        }
                      }}><span parentName="span"></span></span></span></span></span></span></span></span></span></span>{`) to the affine transformation:`}</p>
    <p><span parentName="p" {...{
        "className": "math math-inline"
      }}><span parentName="span" {...{
          "className": "katex"
        }}><span parentName="span" {...{
            "className": "katex-mathml"
          }}><math parentName="span" {...{
              "xmlns": "http://www.w3.org/1998/Math/MathML"
            }}><semantics parentName="math"><mrow parentName="semantics"><mi parentName="mrow">{`a`}</mi><mo parentName="mrow" {...{
                    "stretchy": "false"
                  }}>{`(`}</mo><mi parentName="mrow">{`W`}</mi><msub parentName="mrow"><mi parentName="msub">{`x`}</mi><mi parentName="msub">{`t`}</mi></msub><mo parentName="mrow">{`+`}</mo><mi parentName="mrow">{`b`}</mi><mo parentName="mrow" {...{
                    "stretchy": "false"
                  }}>{`)`}</mo><mo parentName="mrow">{`+`}</mo><msub parentName="mrow"><mi parentName="msub">{`x`}</mi><mrow parentName="msub"><mi parentName="mrow">{`t`}</mi><mo parentName="mrow">{`−`}</mo><mi parentName="mrow">{`n`}</mi></mrow></msub></mrow><annotation parentName="semantics" {...{
                  "encoding": "application/x-tex"
                }}>{`a(Wx_{t}+b) + x_{t-{n}}`}</annotation></semantics></math></span><span parentName="span" {...{
            "className": "katex-html",
            "aria-hidden": "true"
          }}><span parentName="span" {...{
              "className": "base"
            }}><span parentName="span" {...{
                "className": "strut",
                "style": {
                  "height": "1em",
                  "verticalAlign": "-0.25em"
                }
              }}></span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`a`}</span><span parentName="span" {...{
                "className": "mopen"
              }}>{`(`}</span><span parentName="span" {...{
                "className": "mord mathnormal",
                "style": {
                  "marginRight": "0.13889em"
                }
              }}>{`W`}</span><span parentName="span" {...{
                "className": "mord"
              }}><span parentName="span" {...{
                  "className": "mord mathnormal"
                }}>{`x`}</span><span parentName="span" {...{
                  "className": "msupsub"
                }}><span parentName="span" {...{
                    "className": "vlist-t vlist-t2"
                  }}><span parentName="span" {...{
                      "className": "vlist-r"
                    }}><span parentName="span" {...{
                        "className": "vlist",
                        "style": {
                          "height": "0.2806em"
                        }
                      }}><span parentName="span" {...{
                          "style": {
                            "top": "-2.55em",
                            "marginLeft": "0em",
                            "marginRight": "0.05em"
                          }
                        }}><span parentName="span" {...{
                            "className": "pstrut",
                            "style": {
                              "height": "2.7em"
                            }
                          }}></span><span parentName="span" {...{
                            "className": "sizing reset-size6 size3 mtight"
                          }}><span parentName="span" {...{
                              "className": "mord mtight"
                            }}><span parentName="span" {...{
                                "className": "mord mathnormal mtight"
                              }}>{`t`}</span></span></span></span></span><span parentName="span" {...{
                        "className": "vlist-s"
                      }}>{`​`}</span></span><span parentName="span" {...{
                      "className": "vlist-r"
                    }}><span parentName="span" {...{
                        "className": "vlist",
                        "style": {
                          "height": "0.15em"
                        }
                      }}><span parentName="span"></span></span></span></span></span></span><span parentName="span" {...{
                "className": "mspace",
                "style": {
                  "marginRight": "0.2222em"
                }
              }}></span><span parentName="span" {...{
                "className": "mbin"
              }}>{`+`}</span><span parentName="span" {...{
                "className": "mspace",
                "style": {
                  "marginRight": "0.2222em"
                }
              }}></span></span><span parentName="span" {...{
              "className": "base"
            }}><span parentName="span" {...{
                "className": "strut",
                "style": {
                  "height": "1em",
                  "verticalAlign": "-0.25em"
                }
              }}></span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`b`}</span><span parentName="span" {...{
                "className": "mclose"
              }}>{`)`}</span><span parentName="span" {...{
                "className": "mspace",
                "style": {
                  "marginRight": "0.2222em"
                }
              }}></span><span parentName="span" {...{
                "className": "mbin"
              }}>{`+`}</span><span parentName="span" {...{
                "className": "mspace",
                "style": {
                  "marginRight": "0.2222em"
                }
              }}></span></span><span parentName="span" {...{
              "className": "base"
            }}><span parentName="span" {...{
                "className": "strut",
                "style": {
                  "height": "0.6389em",
                  "verticalAlign": "-0.2083em"
                }
              }}></span><span parentName="span" {...{
                "className": "mord"
              }}><span parentName="span" {...{
                  "className": "mord mathnormal"
                }}>{`x`}</span><span parentName="span" {...{
                  "className": "msupsub"
                }}><span parentName="span" {...{
                    "className": "vlist-t vlist-t2"
                  }}><span parentName="span" {...{
                      "className": "vlist-r"
                    }}><span parentName="span" {...{
                        "className": "vlist",
                        "style": {
                          "height": "0.2806em"
                        }
                      }}><span parentName="span" {...{
                          "style": {
                            "top": "-2.55em",
                            "marginLeft": "0em",
                            "marginRight": "0.05em"
                          }
                        }}><span parentName="span" {...{
                            "className": "pstrut",
                            "style": {
                              "height": "2.7em"
                            }
                          }}></span><span parentName="span" {...{
                            "className": "sizing reset-size6 size3 mtight"
                          }}><span parentName="span" {...{
                              "className": "mord mtight"
                            }}><span parentName="span" {...{
                                "className": "mord mathnormal mtight"
                              }}>{`t`}</span><span parentName="span" {...{
                                "className": "mbin mtight"
                              }}>{`−`}</span><span parentName="span" {...{
                                "className": "mord mtight"
                              }}><span parentName="span" {...{
                                  "className": "mord mathnormal mtight"
                                }}>{`n`}</span></span></span></span></span></span><span parentName="span" {...{
                        "className": "vlist-s"
                      }}>{`​`}</span></span><span parentName="span" {...{
                      "className": "vlist-r"
                    }}><span parentName="span" {...{
                        "className": "vlist",
                        "style": {
                          "height": "0.2083em"
                        }
                      }}><span parentName="span"></span></span></span></span></span></span></span></span></span></span></p>
    <p>{`Fancier alternatives exist, such as dense connections where each layer is connected to all following layers in a weighted fashion.`}</p>
    <h2 {...{
      "id": "references",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h2" {...{
        "href": "#references",
        "aria-label": "references permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`References`}</h2>
    <p>{`@@bibliography@@
@Book{goldberg2017neural,
author = {Goldberg, Yoav},
title = {Neural Network Methods for Natural Language Processing},
year = {2017},
publisher = {Morgan & Claypool Publishers},
address = {San Rafael, California},
isbn = {1627052984},
url = {\\url{ebookcentral.proquest.com/lib/uaz/reader.action?docID=4843762}},
}
@misc{ruder2017deeplearningnlp,
author = {Ruder, Sebastian},
title = {Deep Learning for NLP Best Practices},
year = {2017},
howpublished = {\\url{`}<a parentName="p" {...{
        "href": "http://www.ruder.io/deep-learning-nlp-best-practices/",
        "target": "_self",
        "rel": "nofollow"
      }}>{`www.ruder.io/deep-learning-nlp-best-practices/`}</a>{`}},
url = {\\url{`}<a parentName="p" {...{
        "href": "http://www.ruder.io/deep-learning-nlp-best-practices/",
        "target": "_self",
        "rel": "nofollow"
      }}>{`www.ruder.io/deep-learning-nlp-best-practices/`}</a>{`}},
}
@@bibliography@@`}</p>

    </MDXLayout>;
}
;
// Tag the component function with a static `isMDXComponent` flag set to true.
Object.assign(MDXContent, { isMDXComponent: true });
      