import * as React from 'react'
  /* @jsx mdx */
import { mdx } from '@mdx-js/react';
/* @jsxRuntime classic */

/* @jsx mdx */

import DefaultLayout from "/home/runner/work/myedibleenso.github.io/myedibleenso.github.io/src/components/BasicLayout.js";
import { HTMLTable } from '@blueprintjs/core';
// Frontmatter exported alongside the page component; empty for this page.
export const _frontmatter = {};

// Props spread onto the layout wrapper when MDXContent renders.
const layoutProps = { _frontmatter };

// Layout component used to wrap the rendered page content.
const MDXLayout = DefaultLayout;
export default function MDXContent({
  components,
  ...props
}) {
  return <MDXLayout {...layoutProps} {...props} components={components} mdxType="MDXLayout">



    <h2 {...{
      "id": "summary",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h2" {...{
        "href": "#summary",
        "aria-label": "summary permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Summary`}</h2>
    <p>{`Binomial logistic regression using TF-IDF scores of `}<span parentName="p" {...{
        "className": "math math-inline"
      }}><span parentName="span" {...{
          "className": "katex"
        }}><span parentName="span" {...{
            "className": "katex-mathml"
          }}><math parentName="span" {...{
              "xmlns": "http://www.w3.org/1998/Math/MathML"
            }}><semantics parentName="math"><mrow parentName="semantics"><mi parentName="mrow">{`n`}</mi></mrow><annotation parentName="semantics" {...{
                  "encoding": "application/x-tex"
                }}>{`n`}</annotation></semantics></math></span><span parentName="span" {...{
            "className": "katex-html",
            "aria-hidden": "true"
          }}><span parentName="span" {...{
              "className": "base"
            }}><span parentName="span" {...{
                "className": "strut",
                "style": {
                  "height": "0.4306em"
                }
              }}></span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`n`}</span></span></span></span></span>{`-grams (unigrams and bigrams) from the subject and body of emails as features is an efficient and interpretable method of achieving 98% precision and recall in classifying spam messages within an imbalanced dataset.`}</p>
    <p>{`For the provided dataset, deep neural network architectures (ex. transformer-based encoders) provide a negligible improvement at a much higher cost measured in training time and model size (`}<span parentName="p" {...{
        "className": "math math-inline"
      }}><span parentName="span" {...{
          "className": "katex"
        }}><span parentName="span" {...{
            "className": "katex-mathml"
          }}><math parentName="span" {...{
              "xmlns": "http://www.w3.org/1998/Math/MathML"
            }}><semantics parentName="math"><mrow parentName="semantics"><mn parentName="mrow">{`1`}</mn><msup parentName="mrow"><mn parentName="msup">{`0`}</mn><mn parentName="msup">{`4`}</mn></msup><mo parentName="mrow">{`≪`}</mo><mn parentName="mrow">{`1`}</mn><msup parentName="mrow"><mn parentName="msup">{`0`}</mn><mn parentName="msup">{`8`}</mn></msup></mrow><annotation parentName="semantics" {...{
                  "encoding": "application/x-tex"
                }}>{`10^{4} \\ll 10^{8}`}</annotation></semantics></math></span><span parentName="span" {...{
            "className": "katex-html",
            "aria-hidden": "true"
          }}><span parentName="span" {...{
              "className": "base"
            }}><span parentName="span" {...{
                "className": "strut",
                "style": {
                  "height": "0.8532em",
                  "verticalAlign": "-0.0391em"
                }
              }}></span><span parentName="span" {...{
                "className": "mord"
              }}>{`1`}</span><span parentName="span" {...{
                "className": "mord"
              }}><span parentName="span" {...{
                  "className": "mord"
                }}>{`0`}</span><span parentName="span" {...{
                  "className": "msupsub"
                }}><span parentName="span" {...{
                    "className": "vlist-t"
                  }}><span parentName="span" {...{
                      "className": "vlist-r"
                    }}><span parentName="span" {...{
                        "className": "vlist",
                        "style": {
                          "height": "0.8141em"
                        }
                      }}><span parentName="span" {...{
                          "style": {
                            "top": "-3.063em",
                            "marginRight": "0.05em"
                          }
                        }}><span parentName="span" {...{
                            "className": "pstrut",
                            "style": {
                              "height": "2.7em"
                            }
                          }}></span><span parentName="span" {...{
                            "className": "sizing reset-size6 size3 mtight"
                          }}><span parentName="span" {...{
                              "className": "mord mtight"
                            }}><span parentName="span" {...{
                                "className": "mord mtight"
                              }}>{`4`}</span></span></span></span></span></span></span></span></span><span parentName="span" {...{
                "className": "mspace",
                "style": {
                  "marginRight": "0.2778em"
                }
              }}></span><span parentName="span" {...{
                "className": "mrel"
              }}>{`≪`}</span><span parentName="span" {...{
                "className": "mspace",
                "style": {
                  "marginRight": "0.2778em"
                }
              }}></span></span><span parentName="span" {...{
              "className": "base"
            }}><span parentName="span" {...{
                "className": "strut",
                "style": {
                  "height": "0.8141em"
                }
              }}></span><span parentName="span" {...{
                "className": "mord"
              }}>{`1`}</span><span parentName="span" {...{
                "className": "mord"
              }}><span parentName="span" {...{
                  "className": "mord"
                }}>{`0`}</span><span parentName="span" {...{
                  "className": "msupsub"
                }}><span parentName="span" {...{
                    "className": "vlist-t"
                  }}><span parentName="span" {...{
                      "className": "vlist-r"
                    }}><span parentName="span" {...{
                        "className": "vlist",
                        "style": {
                          "height": "0.8141em"
                        }
                      }}><span parentName="span" {...{
                          "style": {
                            "top": "-3.063em",
                            "marginRight": "0.05em"
                          }
                        }}><span parentName="span" {...{
                            "className": "pstrut",
                            "style": {
                              "height": "2.7em"
                            }
                          }}></span><span parentName="span" {...{
                            "className": "sizing reset-size6 size3 mtight"
                          }}><span parentName="span" {...{
                              "className": "mord mtight"
                            }}><span parentName="span" {...{
                                "className": "mord mtight"
                              }}>{`8`}</span></span></span></span></span></span></span></span></span></span></span></span></span>{` parameters).`}</p>
    {
      /* `python -c 'from transformers import AutoModelForSequenceClassification; bert = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased"); print(sum(p.numel() for p in bert.parameters()))'` */
    }
    <h2 {...{
      "id": "problem",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h2" {...{
        "href": "#problem",
        "aria-label": "problem permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Problem`}</h2>
    <p>{`Given a labeled dataset of ~5K emails, our task is to learn to classify spam messages.  The subject and body of each email are provided along with its label (spam vs `}<span parentName="p" {...{
        "className": "math math-inline"
      }}><span parentName="span" {...{
          "className": "katex"
        }}><span parentName="span" {...{
            "className": "katex-mathml"
          }}><math parentName="span" {...{
              "xmlns": "http://www.w3.org/1998/Math/MathML"
            }}><semantics parentName="math"><mrow parentName="semantics"><mi parentName="mrow" {...{
                    "mathvariant": "normal"
                  }}>{`¬`}</mi></mrow><annotation parentName="semantics" {...{
                  "encoding": "application/x-tex"
                }}>{`\\neg`}</annotation></semantics></math></span><span parentName="span" {...{
            "className": "katex-html",
            "aria-hidden": "true"
          }}><span parentName="span" {...{
              "className": "base"
            }}><span parentName="span" {...{
                "className": "strut",
                "style": {
                  "height": "0.4306em"
                }
              }}></span><span parentName="span" {...{
                "className": "mord"
              }}>{`¬`}</span></span></span></span></span>{` spam). Roughly a third of our dataset is spam, and no train/test split is provided.`}</p>
    <h2 {...{
      "id": "observations",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h2" {...{
        "href": "#observations",
        "aria-label": "observations permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Observations`}</h2>
    <p>{`Before devising a strategy, some exploratory data analysis is always helpful.  Here are my key observations:`}</p>
    <ul>
      <li parentName="ul"><strong parentName="li">{`Precision is more important than recall in spam detection`}</strong>{`: Allowing the occasional spam message to appear in a user's inbox is less of an issue than inadvertently sending a legitimate email to a user's spam folder (i.e., false positives are a bigger problem than false negatives).`}</li>
      <li parentName="ul"><strong parentName="li">{`Text has already been casefolded`}</strong>{`:  Both the contents of the subject and email body have been lowercased.  Wordshape features are unlikely to provide much value.`}</li>
      <li parentName="ul"><strong parentName="li">{`No metadata included in the dataset`}</strong>{`:  We cannot devise features around the sender and the sender's relationship with the recipient (ex. have they written back and forth previously?)`}</li>
      <li parentName="ul"><strong parentName="li">{`No HTML in email body`}</strong>{`:  HTML has been stripped from the email contents.  This means we can't devise or learn features based on the `}<code parentName="li" {...{
          "className": "language-text"
        }}>{`href`}</code>{` attribute of anchor tags or whether or not the value of the `}<code parentName="li" {...{
          "className": "language-text"
        }}>{`href`}</code>{` attribute matches the anchor tag's display text.`}</li>
      <li parentName="ul"><strong parentName="li">{`Text appears to have already been tokenized`}</strong>{`:  The text of the subject and email body appears to have already been tokenized and joined on whitespace.  This means there is no need for a sophisticated tokenizer, but also means it is a bit less straightforward to detect and normalize URLs.`}</li>
    </ul>
    <h2 {...{
      "id": "methodology",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h2" {...{
        "href": "#methodology",
        "aria-label": "methodology permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Methodology`}</h2>
    <p>{`We will examine two approaches: a classic ML model and a more recent approach that has become standard in NLP.`}</p>
    <h3 {...{
      "id": "classic",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h3" {...{
        "href": "#classic",
        "aria-label": "classic permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Classic`}</h3>
    <p>{`A simple starting point for such a problem is to use a linear model with `}<span parentName="p" {...{
        "className": "math math-inline"
      }}><span parentName="span" {...{
          "className": "katex"
        }}><span parentName="span" {...{
            "className": "katex-mathml"
          }}><math parentName="span" {...{
              "xmlns": "http://www.w3.org/1998/Math/MathML"
            }}><semantics parentName="math"><mrow parentName="semantics"><mi parentName="mrow">{`n`}</mi></mrow><annotation parentName="semantics" {...{
                  "encoding": "application/x-tex"
                }}>{`n`}</annotation></semantics></math></span><span parentName="span" {...{
            "className": "katex-html",
            "aria-hidden": "true"
          }}><span parentName="span" {...{
              "className": "base"
            }}><span parentName="span" {...{
                "className": "strut",
                "style": {
                  "height": "0.4306em"
                }
              }}></span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`n`}</span></span></span></span></span>{`-gram features (bag of `}<span parentName="p" {...{
        "className": "math math-inline"
      }}><span parentName="span" {...{
          "className": "katex"
        }}><span parentName="span" {...{
            "className": "katex-mathml"
          }}><math parentName="span" {...{
              "xmlns": "http://www.w3.org/1998/Math/MathML"
            }}><semantics parentName="math"><mrow parentName="semantics"><mi parentName="mrow">{`n`}</mi></mrow><annotation parentName="semantics" {...{
                  "encoding": "application/x-tex"
                }}>{`n`}</annotation></semantics></math></span><span parentName="span" {...{
            "className": "katex-html",
            "aria-hidden": "true"
          }}><span parentName="span" {...{
              "className": "base"
            }}><span parentName="span" {...{
                "className": "strut",
                "style": {
                  "height": "0.4306em"
                }
              }}></span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`n`}</span></span></span></span></span>{`-grams) with unigrams (individual tokens) and bigrams (pairs of tokens).  `}</p>
    <div {...{
      "className": "admonition admonition-info alert alert--info"
    }}><div parentName="div" {...{
        "className": "admonition-heading"
      }}><h5 parentName="div"><span parentName="h5" {...{
            "className": "admonition-icon"
          }}><svg parentName="span" {...{
              "xmlns": "http://www.w3.org/2000/svg",
              "width": "14",
              "height": "16",
              "viewBox": "0 0 14 16"
            }}><path parentName="svg" {...{
                "fillRule": "evenodd",
                "d": "M7 2.3c3.14 0 5.7 2.56 5.7 5.7s-2.56 5.7-5.7 5.7A5.71 5.71 0 0 1 1.3 8c0-3.14 2.56-5.7 5.7-5.7zM7 1C3.14 1 0 4.14 0 8s3.14 7 7 7 7-3.14 7-7-3.14-7-7-7zm1 3H6v5h2V4zm0 6H6v2h2v-2z"
              }}></path></svg></span>{`higher-order n-grams`}</h5></div><div parentName="div" {...{
        "className": "admonition-content"
      }}><p parentName="div">{`Token (as opposed to character) `}<span parentName="p" {...{
            "className": "math math-inline"
          }}><span parentName="span" {...{
              "className": "katex"
            }}><span parentName="span" {...{
                "className": "katex-mathml"
              }}><math parentName="span" {...{
                  "xmlns": "http://www.w3.org/1998/Math/MathML"
                }}><semantics parentName="math"><mrow parentName="semantics"><mi parentName="mrow">{`n`}</mi></mrow><annotation parentName="semantics" {...{
                      "encoding": "application/x-tex"
                    }}>{`n`}</annotation></semantics></math></span><span parentName="span" {...{
                "className": "katex-html",
                "aria-hidden": "true"
              }}><span parentName="span" {...{
                  "className": "base"
                }}><span parentName="span" {...{
                    "className": "strut",
                    "style": {
                      "height": "0.4306em"
                    }
                  }}></span><span parentName="span" {...{
                    "className": "mord mathnormal"
                  }}>{`n`}</span></span></span></span></span>{`-gram features are typically very sparse.  As the value of `}<span parentName="p" {...{
            "className": "math math-inline"
          }}><span parentName="span" {...{
              "className": "katex"
            }}><span parentName="span" {...{
                "className": "katex-mathml"
              }}><math parentName="span" {...{
                  "xmlns": "http://www.w3.org/1998/Math/MathML"
                }}><semantics parentName="math"><mrow parentName="semantics"><mi parentName="mrow">{`n`}</mi></mrow><annotation parentName="semantics" {...{
                      "encoding": "application/x-tex"
                    }}>{`n`}</annotation></semantics></math></span><span parentName="span" {...{
                "className": "katex-html",
                "aria-hidden": "true"
              }}><span parentName="span" {...{
                  "className": "base"
                }}><span parentName="span" {...{
                    "className": "strut",
                    "style": {
                      "height": "0.4306em"
                    }
                  }}></span><span parentName="span" {...{
                    "className": "mord mathnormal"
                  }}>{`n`}</span></span></span></span></span>{` increases, `}<span parentName="p" {...{
            "className": "math math-inline"
          }}><span parentName="span" {...{
              "className": "katex"
            }}><span parentName="span" {...{
                "className": "katex-mathml"
              }}><math parentName="span" {...{
                  "xmlns": "http://www.w3.org/1998/Math/MathML"
                }}><semantics parentName="math"><mrow parentName="semantics"><mi parentName="mrow">{`n`}</mi></mrow><annotation parentName="semantics" {...{
                      "encoding": "application/x-tex"
                    }}>{`n`}</annotation></semantics></math></span><span parentName="span" {...{
                "className": "katex-html",
                "aria-hidden": "true"
              }}><span parentName="span" {...{
                  "className": "base"
                }}><span parentName="span" {...{
                    "className": "strut",
                    "style": {
                      "height": "0.4306em"
                    }
                  }}></span><span parentName="span" {...{
                    "className": "mord mathnormal"
                  }}>{`n`}</span></span></span></span></span>{`-grams become less common. You might encounter a word (unigram) frequently but many longer phrases are wholly unique. Without imposing a minimum frequency, including rare, higher-order token `}<span parentName="p" {...{
            "className": "math math-inline"
          }}><span parentName="span" {...{
              "className": "katex"
            }}><span parentName="span" {...{
                "className": "katex-mathml"
              }}><math parentName="span" {...{
                  "xmlns": "http://www.w3.org/1998/Math/MathML"
                }}><semantics parentName="math"><mrow parentName="semantics"><mi parentName="mrow">{`n`}</mi></mrow><annotation parentName="semantics" {...{
                      "encoding": "application/x-tex"
                    }}>{`n`}</annotation></semantics></math></span><span parentName="span" {...{
                "className": "katex-html",
                "aria-hidden": "true"
              }}><span parentName="span" {...{
                  "className": "base"
                }}><span parentName="span" {...{
                    "className": "strut",
                    "style": {
                      "height": "0.4306em"
                    }
                  }}></span><span parentName="span" {...{
                    "className": "mord mathnormal"
                  }}>{`n`}</span></span></span></span></span>{`-grams as features usually leads to overfitting.  To keep things simple and mitigate this risk, we'll start with token unigrams and bigrams.`}</p></div></div>
    <h4 {...{
      "id": "features",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h4" {...{
        "href": "#features",
        "aria-label": "features permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Features`}</h4>
    <p>{`Rather than using raw counts of `}<span parentName="p" {...{
        "className": "math math-inline"
      }}><span parentName="span" {...{
          "className": "katex"
        }}><span parentName="span" {...{
            "className": "katex-mathml"
          }}><math parentName="span" {...{
              "xmlns": "http://www.w3.org/1998/Math/MathML"
            }}><semantics parentName="math"><mrow parentName="semantics"><mi parentName="mrow">{`n`}</mi></mrow><annotation parentName="semantics" {...{
                  "encoding": "application/x-tex"
                }}>{`n`}</annotation></semantics></math></span><span parentName="span" {...{
            "className": "katex-html",
            "aria-hidden": "true"
          }}><span parentName="span" {...{
              "className": "base"
            }}><span parentName="span" {...{
                "className": "strut",
                "style": {
                  "height": "0.4306em"
                }
              }}></span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`n`}</span></span></span></span></span>{`-grams, though, we'll instead use a measure of the `}<span parentName="p" {...{
        "className": "math math-inline"
      }}><span parentName="span" {...{
          "className": "katex"
        }}><span parentName="span" {...{
            "className": "katex-mathml"
          }}><math parentName="span" {...{
              "xmlns": "http://www.w3.org/1998/Math/MathML"
            }}><semantics parentName="math"><mrow parentName="semantics"><mi parentName="mrow">{`n`}</mi></mrow><annotation parentName="semantics" {...{
                  "encoding": "application/x-tex"
                }}>{`n`}</annotation></semantics></math></span><span parentName="span" {...{
            "className": "katex-html",
            "aria-hidden": "true"
          }}><span parentName="span" {...{
              "className": "base"
            }}><span parentName="span" {...{
                "className": "strut",
                "style": {
                  "height": "0.4306em"
                }
              }}></span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`n`}</span></span></span></span></span>{`-gram's relevance to a document.  One such measure is term frequency-inverse document frequency (TF-IDF).`}</p>
    <p><strong parentName="p">{`term frequency`}</strong>{` (TF) is measured per document.  It is simply a count of term `}<span parentName="p" {...{
        "className": "math math-inline"
      }}><span parentName="span" {...{
          "className": "katex"
        }}><span parentName="span" {...{
            "className": "katex-mathml"
          }}><math parentName="span" {...{
              "xmlns": "http://www.w3.org/1998/Math/MathML"
            }}><semantics parentName="math"><mrow parentName="semantics"><mi parentName="mrow">{`t`}</mi></mrow><annotation parentName="semantics" {...{
                  "encoding": "application/x-tex"
                }}>{`t`}</annotation></semantics></math></span><span parentName="span" {...{
            "className": "katex-html",
            "aria-hidden": "true"
          }}><span parentName="span" {...{
              "className": "base"
            }}><span parentName="span" {...{
                "className": "strut",
                "style": {
                  "height": "0.6151em"
                }
              }}></span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`t`}</span></span></span></span></span>{` within a document `}<span parentName="p" {...{
        "className": "math math-inline"
      }}><span parentName="span" {...{
          "className": "katex"
        }}><span parentName="span" {...{
            "className": "katex-mathml"
          }}><math parentName="span" {...{
              "xmlns": "http://www.w3.org/1998/Math/MathML"
            }}><semantics parentName="math"><mrow parentName="semantics"><mi parentName="mrow">{`d`}</mi></mrow><annotation parentName="semantics" {...{
                  "encoding": "application/x-tex"
                }}>{`d`}</annotation></semantics></math></span><span parentName="span" {...{
            "className": "katex-html",
            "aria-hidden": "true"
          }}><span parentName="span" {...{
              "className": "base"
            }}><span parentName="span" {...{
                "className": "strut",
                "style": {
                  "height": "0.6944em"
                }
              }}></span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`d`}</span></span></span></span></span>{`.  Typically, we take the log of this value.  The intuition is that the more frequently a term appears in a document, the more important the term is to that document. Of course, there are exceptions.  For instance, functional words like determiners ("a", "an", "the", etc.) are common across a corpus.  Document collections may share a common template (ex. `}<code parentName="p" {...{
        "className": "language-text"
      }}>{`Subject:`}</code>{`).  The `}<strong parentName="p">{`inverse document frequency`}</strong>{` (IDF) provides a way of downweighting such cases:`}</p>
    <div {...{
      "className": "math math-display"
    }}><span parentName="div" {...{
        "className": "katex-display"
      }}><span parentName="span" {...{
          "className": "katex"
        }}><span parentName="span" {...{
            "className": "katex-mathml"
          }}><math parentName="span" {...{
              "xmlns": "http://www.w3.org/1998/Math/MathML",
              "display": "block"
            }}><semantics parentName="math"><mrow parentName="semantics"><mtext parentName="mrow">{`idf`}</mtext><mo parentName="mrow" {...{
                    "stretchy": "false"
                  }}>{`(`}</mo><mi parentName="mrow">{`t`}</mi><mo parentName="mrow" {...{
                    "separator": "true"
                  }}>{`,`}</mo><mi parentName="mrow">{`D`}</mi><mo parentName="mrow" {...{
                    "stretchy": "false"
                  }}>{`)`}</mo><mo parentName="mrow">{`=`}</mo><mi parentName="mrow">{`log`}</mi><mo parentName="mrow">{`⁡`}</mo><mfrac parentName="mrow"><mrow parentName="mfrac"><mi parentName="mrow" {...{
                        "mathvariant": "normal"
                      }}>{`∣`}</mi><mi parentName="mrow">{`D`}</mi><mi parentName="mrow" {...{
                        "mathvariant": "normal"
                      }}>{`∣`}</mi></mrow><mrow parentName="mfrac"><munder parentName="mrow"><mo parentName="munder">{`∑`}</mo><mrow parentName="munder"><mi parentName="mrow">{`d`}</mi><mo parentName="mrow">{`∈`}</mo><mi parentName="mrow">{`D`}</mi></mrow></munder><mrow parentName="mrow"><mo parentName="mrow" {...{
                          "fence": "true"
                        }}>{`{`}</mo><mtable parentName="mrow" {...{
                          "rowspacing": "0.36em",
                          "columnalign": "left left",
                          "columnspacing": "1em"
                        }}><mtr parentName="mtable"><mtd parentName="mtr"><mstyle parentName="mtd" {...{
                                "scriptlevel": "0",
                                "displaystyle": "false"
                              }}><mn parentName="mstyle">{`1`}</mn></mstyle></mtd><mtd parentName="mtr"><mstyle parentName="mtd" {...{
                                "scriptlevel": "0",
                                "displaystyle": "false"
                              }}><mrow parentName="mstyle"><mtext parentName="mrow">{`if`}</mtext><mspace parentName="mrow" {...{
                                    "width": "1em"
                                  }}></mspace><mi parentName="mrow">{`t`}</mi><mo parentName="mrow">{`∈`}</mo><mi parentName="mrow">{`d`}</mi></mrow></mstyle></mtd></mtr><mtr parentName="mtable"><mtd parentName="mtr"><mstyle parentName="mtd" {...{
                                "scriptlevel": "0",
                                "displaystyle": "false"
                              }}><mn parentName="mstyle">{`0`}</mn></mstyle></mtd><mtd parentName="mtr"><mstyle parentName="mtd" {...{
                                "scriptlevel": "0",
                                "displaystyle": "false"
                              }}><mtext parentName="mstyle">{`otherwise`}</mtext></mstyle></mtd></mtr></mtable></mrow></mrow></mfrac></mrow><annotation parentName="semantics" {...{
                  "encoding": "application/x-tex"
                }}>{`\\text{idf}(t, D) = \\log\\frac{\\vert D \\vert}{
  \\sum_{d \\in D}
    \\begin{cases}
      1 & \\text{if} \\quad t \\in d \\\\ 
      0 & \\text{otherwise}
    \\end{cases}
  }`}</annotation></semantics></math></span><span parentName="span" {...{
            "className": "katex-html",
            "aria-hidden": "true"
          }}><span parentName="span" {...{
              "className": "base"
            }}><span parentName="span" {...{
                "className": "strut",
                "style": {
                  "height": "1em",
                  "verticalAlign": "-0.25em"
                }
              }}></span><span parentName="span" {...{
                "className": "mord text"
              }}><span parentName="span" {...{
                  "className": "mord"
                }}>{`idf`}</span></span><span parentName="span" {...{
                "className": "mopen"
              }}>{`(`}</span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`t`}</span><span parentName="span" {...{
                "className": "mpunct"
              }}>{`,`}</span><span parentName="span" {...{
                "className": "mspace",
                "style": {
                  "marginRight": "0.1667em"
                }
              }}></span><span parentName="span" {...{
                "className": "mord mathnormal",
                "style": {
                  "marginRight": "0.02778em"
                }
              }}>{`D`}</span><span parentName="span" {...{
                "className": "mclose"
              }}>{`)`}</span><span parentName="span" {...{
                "className": "mspace",
                "style": {
                  "marginRight": "0.2778em"
                }
              }}></span><span parentName="span" {...{
                "className": "mrel"
              }}>{`=`}</span><span parentName="span" {...{
                "className": "mspace",
                "style": {
                  "marginRight": "0.2778em"
                }
              }}></span></span><span parentName="span" {...{
              "className": "base"
            }}><span parentName="span" {...{
                "className": "strut",
                "style": {
                  "height": "4.317em",
                  "verticalAlign": "-2.89em"
                }
              }}></span><span parentName="span" {...{
                "className": "mop"
              }}>{`lo`}<span parentName="span" {...{
                  "style": {
                    "marginRight": "0.01389em"
                  }
                }}>{`g`}</span></span><span parentName="span" {...{
                "className": "mspace",
                "style": {
                  "marginRight": "0.1667em"
                }
              }}></span><span parentName="span" {...{
                "className": "mord"
              }}><span parentName="span" {...{
                  "className": "mopen nulldelimiter"
                }}></span><span parentName="span" {...{
                  "className": "mfrac"
                }}><span parentName="span" {...{
                    "className": "vlist-t vlist-t2"
                  }}><span parentName="span" {...{
                      "className": "vlist-r"
                    }}><span parentName="span" {...{
                        "className": "vlist",
                        "style": {
                          "height": "1.427em"
                        }
                      }}><span parentName="span" {...{
                          "style": {
                            "top": "-2.11em"
                          }
                        }}><span parentName="span" {...{
                            "className": "pstrut",
                            "style": {
                              "height": "3.75em"
                            }
                          }}></span><span parentName="span" {...{
                            "className": "mord"
                          }}><span parentName="span" {...{
                              "className": "mop"
                            }}><span parentName="span" {...{
                                "className": "mop op-symbol small-op",
                                "style": {
                                  "position": "relative",
                                  "top": "0em"
                                }
                              }}>{`∑`}</span><span parentName="span" {...{
                                "className": "msupsub"
                              }}><span parentName="span" {...{
                                  "className": "vlist-t vlist-t2"
                                }}><span parentName="span" {...{
                                    "className": "vlist-r"
                                  }}><span parentName="span" {...{
                                      "className": "vlist",
                                      "style": {
                                        "height": "0.1864em"
                                      }
                                    }}><span parentName="span" {...{
                                        "style": {
                                          "top": "-2.4003em",
                                          "marginLeft": "0em",
                                          "marginRight": "0.05em"
                                        }
                                      }}><span parentName="span" {...{
                                          "className": "pstrut",
                                          "style": {
                                            "height": "2.7em"
                                          }
                                        }}></span><span parentName="span" {...{
                                          "className": "sizing reset-size6 size3 mtight"
                                        }}><span parentName="span" {...{
                                            "className": "mord mtight"
                                          }}><span parentName="span" {...{
                                              "className": "mord mathnormal mtight"
                                            }}>{`d`}</span><span parentName="span" {...{
                                              "className": "mrel mtight"
                                            }}>{`∈`}</span><span parentName="span" {...{
                                              "className": "mord mathnormal mtight",
                                              "style": {
                                                "marginRight": "0.02778em"
                                              }
                                            }}>{`D`}</span></span></span></span></span><span parentName="span" {...{
                                      "className": "vlist-s"
                                    }}>{`​`}</span></span><span parentName="span" {...{
                                    "className": "vlist-r"
                                  }}><span parentName="span" {...{
                                      "className": "vlist",
                                      "style": {
                                        "height": "0.3271em"
                                      }
                                    }}><span parentName="span"></span></span></span></span></span></span><span parentName="span" {...{
                              "className": "mspace",
                              "style": {
                                "marginRight": "0.1667em"
                              }
                            }}></span><span parentName="span" {...{
                              "className": "minner"
                            }}><span parentName="span" {...{
                                "className": "mopen delimcenter",
                                "style": {
                                  "top": "0em"
                                }
                              }}><span parentName="span" {...{
                                  "className": "delimsizing size4"
                                }}>{`{`}</span></span><span parentName="span" {...{
                                "className": "mord"
                              }}><span parentName="span" {...{
                                  "className": "mtable"
                                }}><span parentName="span" {...{
                                    "className": "col-align-l"
                                  }}><span parentName="span" {...{
                                      "className": "vlist-t vlist-t2"
                                    }}><span parentName="span" {...{
                                        "className": "vlist-r"
                                      }}><span parentName="span" {...{
                                          "className": "vlist",
                                          "style": {
                                            "height": "1.69em"
                                          }
                                        }}><span parentName="span" {...{
                                            "style": {
                                              "top": "-3.69em"
                                            }
                                          }}><span parentName="span" {...{
                                              "className": "pstrut",
                                              "style": {
                                                "height": "3.008em"
                                              }
                                            }}></span><span parentName="span" {...{
                                              "className": "mord"
                                            }}><span parentName="span" {...{
                                                "className": "mord"
                                              }}>{`1`}</span></span></span><span parentName="span" {...{
                                            "style": {
                                              "top": "-2.25em"
                                            }
                                          }}><span parentName="span" {...{
                                              "className": "pstrut",
                                              "style": {
                                                "height": "3.008em"
                                              }
                                            }}></span><span parentName="span" {...{
                                              "className": "mord"
                                            }}><span parentName="span" {...{
                                                "className": "mord"
                                              }}>{`0`}</span></span></span></span><span parentName="span" {...{
                                          "className": "vlist-s"
                                        }}>{`​`}</span></span><span parentName="span" {...{
                                        "className": "vlist-r"
                                      }}><span parentName="span" {...{
                                          "className": "vlist",
                                          "style": {
                                            "height": "1.19em"
                                          }
                                        }}><span parentName="span"></span></span></span></span></span><span parentName="span" {...{
                                    "className": "arraycolsep",
                                    "style": {
                                      "width": "1em"
                                    }
                                  }}></span><span parentName="span" {...{
                                    "className": "col-align-l"
                                  }}><span parentName="span" {...{
                                      "className": "vlist-t vlist-t2"
                                    }}><span parentName="span" {...{
                                        "className": "vlist-r"
                                      }}><span parentName="span" {...{
                                          "className": "vlist",
                                          "style": {
                                            "height": "1.69em"
                                          }
                                        }}><span parentName="span" {...{
                                            "style": {
                                              "top": "-3.69em"
                                            }
                                          }}><span parentName="span" {...{
                                              "className": "pstrut",
                                              "style": {
                                                "height": "3.008em"
                                              }
                                            }}></span><span parentName="span" {...{
                                              "className": "mord"
                                            }}><span parentName="span" {...{
                                                "className": "mord text"
                                              }}><span parentName="span" {...{
                                                  "className": "mord"
                                                }}>{`if`}</span></span><span parentName="span" {...{
                                                "className": "mspace",
                                                "style": {
                                                  "marginRight": "1em"
                                                }
                                              }}></span><span parentName="span" {...{
                                                "className": "mord mathnormal"
                                              }}>{`t`}</span><span parentName="span" {...{
                                                "className": "mspace",
                                                "style": {
                                                  "marginRight": "0.2778em"
                                                }
                                              }}></span><span parentName="span" {...{
                                                "className": "mrel"
                                              }}>{`∈`}</span><span parentName="span" {...{
                                                "className": "mspace",
                                                "style": {
                                                  "marginRight": "0.2778em"
                                                }
                                              }}></span><span parentName="span" {...{
                                                "className": "mord mathnormal"
                                              }}>{`d`}</span></span></span><span parentName="span" {...{
                                            "style": {
                                              "top": "-2.25em"
                                            }
                                          }}><span parentName="span" {...{
                                              "className": "pstrut",
                                              "style": {
                                                "height": "3.008em"
                                              }
                                            }}></span><span parentName="span" {...{
                                              "className": "mord"
                                            }}><span parentName="span" {...{
                                                "className": "mord text"
                                              }}><span parentName="span" {...{
                                                  "className": "mord"
                                                }}>{`otherwise`}</span></span></span></span></span><span parentName="span" {...{
                                          "className": "vlist-s"
                                        }}>{`​`}</span></span><span parentName="span" {...{
                                        "className": "vlist-r"
                                      }}><span parentName="span" {...{
                                          "className": "vlist",
                                          "style": {
                                            "height": "1.19em"
                                          }
                                        }}><span parentName="span"></span></span></span></span></span></span></span><span parentName="span" {...{
                                "className": "mclose nulldelimiter"
                              }}></span></span></span></span><span parentName="span" {...{
                          "style": {
                            "top": "-3.98em"
                          }
                        }}><span parentName="span" {...{
                            "className": "pstrut",
                            "style": {
                              "height": "3.75em"
                            }
                          }}></span><span parentName="span" {...{
                            "className": "frac-line",
                            "style": {
                              "borderBottomWidth": "0.04em"
                            }
                          }}></span></span><span parentName="span" {...{
                          "style": {
                            "top": "-4.427em"
                          }
                        }}><span parentName="span" {...{
                            "className": "pstrut",
                            "style": {
                              "height": "3.75em"
                            }
                          }}></span><span parentName="span" {...{
                            "className": "mord"
                          }}><span parentName="span" {...{
                              "className": "mord"
                            }}>{`∣`}</span><span parentName="span" {...{
                              "className": "mord mathnormal",
                              "style": {
                                "marginRight": "0.02778em"
                              }
                            }}>{`D`}</span><span parentName="span" {...{
                              "className": "mord"
                            }}>{`∣`}</span></span></span></span><span parentName="span" {...{
                        "className": "vlist-s"
                      }}>{`​`}</span></span><span parentName="span" {...{
                      "className": "vlist-r"
                    }}><span parentName="span" {...{
                        "className": "vlist",
                        "style": {
                          "height": "2.89em"
                        }
                      }}><span parentName="span"></span></span></span></span></span><span parentName="span" {...{
                  "className": "mclose nulldelimiter"
                }}></span></span></span></span></span></span></div>
    <p>{`TF-IDF is simply the product of those two scores.`}</p>
    <h4 {...{
      "id": "classification-algorithm",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h4" {...{
        "href": "#classification-algorithm",
        "aria-label": "classification algorithm permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Classification algorithm`}</h4>
    <p>{`For our linear classifier, we'll use the shallow artificial neural network consisting of a single neuron: a `}<strong parentName="p">{`binomial logistic regression`}</strong>{`.`}</p>
    <p><undefined parentName="p">{`
        `}<div {...{
          "className": "embedVideo-container"
        }}>{`
            `}<iframe parentName="div" {...{
            "title": "",
            "width": 800,
            "height": 400,
            "src": "https://www.youtube-nocookie.com/embed/GnF5MV98nlU?rel=0",
            "className": "embedVideo-iframe",
            "style": {
              "border": "0"
            },
            "loading": "eager",
            "allowFullScreen": true,
            "sandbox": "allow-same-origin allow-scripts allow-popups"
          }}></iframe>{`
        `}</div></undefined></p>
    <p>{`We'll use a standard binary cross entropy learning objective`}<sup parentName="p" {...{
        "id": "fnref-1"
      }}><a parentName="sup" {...{
          "href": "#fn-1",
          "className": "footnote-ref"
        }}>{`1`}</a></sup>{` and incorporate L2 regularization to control the magnitude of the weights learned by our classifier. This will help us to avoid relying too much on any single feature for our predictions.`}</p>
    <h3 {...{
      "id": "modern",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h3" {...{
        "href": "#modern",
        "aria-label": "modern permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Modern`}</h3>
    <p>{`With the advent of `}<a parentName="p" {...{
        "href": "/tutorials/transformers-for-nlp/#attention"
      }}>{`self-attention`}</a><undefined parentName="p">{`, transformer-based architectures and large language models have come to dominate most benchmarks in NLP.  One of the earliest successful transformer variants used in this setting was BERT`}<span {...{
          "id": "citation-0",
          "data-hover": ""
        }}><span parentName="span" {...{
            "className": "citation-number"
          }}>{`[1]`}</span></span>{`.  For our second approach, we will use a version of the BERT architecture that ignores the case of the text it processes to reduce the model's vocabulary (`}</undefined><a parentName="p" {...{
        "href": "https://huggingface.co/bert-base-uncased",
        "target": "_self",
        "rel": "nofollow"
      }}><code parentName="a" {...{
          "className": "language-text"
        }}>{`bert-base-uncased`}</code></a>{`).  We will use a pre-trained model as our foundation for our classifier.  This foundation or "backbone" has learned many useful properties of language through self-supervised learning on an unlabeled corpus of billions of words.`}<sup parentName="p" {...{
        "id": "fnref-2"
      }}><a parentName="sup" {...{
          "href": "#fn-2",
          "className": "footnote-ref"
        }}>{`2`}</a></sup>{` This model backbone feeds into a classification "head" (a shallow feedforward network with a final softmax layer).  Transformer-based models consistently demonstrate impressive performance, but they have millions (and often billions) of parameters.  Our case-folded version of the base BERT model has 109,483,778 trainable parameters. Rather than learn to adjust all of these parameters, we will instead fix or "freeze" the weights of the first eight layers of the encoder along with its input embeddings.  This improves training efficiency by reducing our trainable parameters to roughly 29M.  Coupled with a low learning rate, parameter freezing helps us avoid quickly overfitting and forgetting useful general-purpose information learned in the pre-training tasks. As with our prior model, we'll use a cross entropy loss as our learning objective.  `}</p>
    <p>{`Unlike our classic approach, our BERT-based classifier will not use any engineered features.  Instead, we'll rely on the network to learn useful representations for our task.  Here we will simply encode each email as follows: `}<code parentName="p" {...{
        "className": "language-text"
      }}>{`[CLS] <subject> [SEP] <body> [SEP]`}</code>{`.  BERT's contextual wordpiece embeddings and attention heads will derive further features. `}</p>
    {
      /* :::note why not start with a SotA model?
      With the advent of [self-attention](/tutorials/transformers-for-nlp/#attention), transformer-based architectures have come to dominate most benchmarks in NLP.  Though their performance is consistently impressive, they have millions (and often billions) of parameters.  Can a transformer-based model be used to solve this problem?  Yes, but it would be overkill and less efficient. A problem like this is easily addressed using classic techniques without the need for specialized hardware for accelerating training and inference.
      ::: */
    }
    <h2 {...{
      "id": "implementation",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h2" {...{
        "href": "#implementation",
        "aria-label": "implementation permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Implementation`}</h2>
    <p><a parentName="p" {...{
        "href": "https://github.com/myedibleenso/logikcull-ml-engineer-hw",
        "target": "_self",
        "rel": "nofollow"
      }}>{`The project`}</a>{` follows a `}<a parentName="p" {...{
        "href": "https://github.com/clu-ling/clu-template",
        "target": "_self",
        "rel": "nofollow"
      }}><code parentName="a" {...{
          "className": "language-text"
        }}>{`cookiecutter`}</code>{` template`}</a>{` that is organized as a small Python library.  `}<a parentName="p" {...{
        "href": "https://github.com/myedibleenso/logikcull-ml-engineer-hw",
        "target": "_self",
        "rel": "nofollow"
      }}>{`Installation instructions are provided in the project README`}</a>{`.  `}</p>
    <h3 {...{
      "id": "training-and-evaluating-our-classifier",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h3" {...{
        "href": "#training-and-evaluating-our-classifier",
        "aria-label": "training and evaluating our classifier permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Training and evaluating our classifier`}</h3>
    <p><a parentName="p" {...{
        "href": "https://github.com/myedibleenso/logikcull-ml-engineer-hw",
        "target": "_self",
        "rel": "nofollow"
      }}>{`The library`}</a>{` can be used to train and evaluate logistic regression and transformer-based models:`}</p>
    <div {...{
      "className": "gatsby-highlight",
      "data-language": "python"
    }}><pre parentName="div" {...{
        "className": "language-python"
      }}><code parentName="pre" {...{
          "className": "language-python"
        }}><span parentName="code" {...{
            "className": "token keyword"
          }}>{`from`}</span>{` logikcull`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`ml`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`toy`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`dataset `}<span parentName="code" {...{
            "className": "token keyword"
          }}>{`import`}</span>{` `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`*`}</span>{`
`}<span parentName="code" {...{
            "className": "token keyword"
          }}>{`from`}</span>{` logikcull`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`ml`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`toy`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`simple `}<span parentName="code" {...{
            "className": "token keyword"
          }}>{`import`}</span>{` `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`*`}</span>{`
`}<span parentName="code" {...{
            "className": "token keyword"
          }}>{`from`}</span>{` sklearn`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`metrics `}<span parentName="code" {...{
            "className": "token keyword"
          }}>{`import`}</span>{` classification_report
`}<span parentName="code" {...{
            "className": "token keyword"
          }}>{`import`}</span>{` numpy `}<span parentName="code" {...{
            "className": "token keyword"
          }}>{`as`}</span>{` np

`}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# load our dataset of emails`}</span>{`
ds`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`:`}</span>{` SimpleDataset `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{` EmailDatasetUtils`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`load_from_dir`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span><span parentName="code" {...{
            "className": "token string"
          }}>{`"data"`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span>{`
`}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# create a stratified split of the dataset (80% training and 20% for evaluation)`}</span>{`
ds_train`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{` ds_test `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{` ds`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`split`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`training_size`}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span><span parentName="code" {...{
            "className": "token number"
          }}>{`.8`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span>{`

`}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# initialize a logistic regression classifier`}</span>{`
`}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# that uses TF-IDF scores of token n-grams as its features`}</span>{`
`}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# and L2 regularization `}</span>{`
`}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# (i.e., controls feature magnitudes w/o eliminating features)`}</span>{`
pipeline `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{` LogisticRegressionSpamClassifierPipeline`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`random_state`}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span><span parentName="code" {...{
            "className": "token number"
          }}>{`42`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span>{`
`}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# extract features and train our classifier`}</span>{`
pipeline`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`fit`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`ds_train`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`X`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{` ds_train`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`y`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span>{`
`}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# view the top features associated with the "spam" class`}</span>{`
features_df `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{` pipeline`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`top_features`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`k`}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span><span parentName="code" {...{
            "className": "token number"
          }}>{`30`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span>{`
features_df`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`head`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span>{`
`}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# evaluate on held-out 20% of dataset`}</span>{`
preds `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{` pipeline`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`predict`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`pipeline`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`transform`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`ds_test`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`X`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span>{`
report `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{` classification_report`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`
  y_true`}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{`ds_test`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`y`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{`
  y_pred`}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{`preds`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{`
  target_names`}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{`ds_test`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`classes`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{`
  output_dict`}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span><span parentName="code" {...{
            "className": "token boolean"
          }}>{`False`}</span>{`
`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span>{`
`}<span parentName="code" {...{
            "className": "token keyword"
          }}>{`print`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`report`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span></code></pre></div>
    <div {...{
      "className": "gatsby-highlight",
      "data-language": "text"
    }}><pre parentName="div" {...{
        "className": "language-text"
      }}><code parentName="pre" {...{
          "className": "language-text"
        }}>{`              precision    recall  f1-score   support

         ham       0.99      1.00      0.99       759
        spam       0.99      0.98      0.98       276

    accuracy                           0.99      1035
   macro avg       0.99      0.99      0.99      1035
weighted avg       0.99      0.99      0.99      1035`}</code></pre></div>
    <p>{`... to train and evaluate a transformer-based model:`}</p>
    <div {...{
      "className": "gatsby-highlight",
      "data-language": "python"
    }}><pre parentName="div" {...{
        "className": "language-python"
      }}><code parentName="pre" {...{
          "className": "language-python"
        }}><span parentName="code" {...{
            "className": "token keyword"
          }}>{`from`}</span>{` logikcull`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`ml`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`toy`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`metrics `}<span parentName="code" {...{
            "className": "token keyword"
          }}>{`import`}</span>{` MetricsUtils
`}<span parentName="code" {...{
            "className": "token keyword"
          }}>{`from`}</span>{` logikcull`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`ml`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`toy`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`dataset `}<span parentName="code" {...{
            "className": "token keyword"
          }}>{`import`}</span>{` `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`*`}</span>{`
`}<span parentName="code" {...{
            "className": "token keyword"
          }}>{`from`}</span>{` logikcull`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`ml`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`toy`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`transformer `}<span parentName="code" {...{
            "className": "token keyword"
          }}>{`import`}</span>{` `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`*`}</span>{`
`}<span parentName="code" {...{
            "className": "token keyword"
          }}>{`from`}</span>{` sklearn`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`metrics `}<span parentName="code" {...{
            "className": "token keyword"
          }}>{`import`}</span>{` classification_report
`}<span parentName="code" {...{
            "className": "token keyword"
          }}>{`import`}</span>{` numpy `}<span parentName="code" {...{
            "className": "token keyword"
          }}>{`as`}</span>{` np

`}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# path to our data folder`}</span>{`
data_dir `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{` `}<span parentName="code" {...{
            "className": "token string"
          }}>{`"data"`}</span>{`
backbone_name `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{` `}<span parentName="code" {...{
            "className": "token string"
          }}>{`"bert-base-uncased"`}</span>{`
_clf `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{` BertSpamClassifier`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`backbone_name`}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{`backbone_name`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{` num_labels`}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span><span parentName="code" {...{
            "className": "token number"
          }}>{`2`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span>{`
`}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# load and preprocess our dataset of emails as a Hugging Face Dataset`}</span>{`
ds `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{` EmailDatasetUtils`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`create_hf_dataset`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`encoder_fn`}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{`_clf`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`encode_datapoint`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{` data_dir`}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{`data_dir`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span>{`
`}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# create a stratified split of the dataset (80% training and 20% for evaluation)`}</span>{`
_ds_train`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{` ds_test `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{` ds`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`split`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`training_size`}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span><span parentName="code" {...{
            "className": "token number"
          }}>{`.8`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span>{`
`}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# ... hold out a portion (10%) of our training data for validation`}</span>{`
ds_train`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{` ds_validation `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{` _ds_train`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`split`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`training_size`}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span><span parentName="code" {...{
            "className": "token number"
          }}>{`.9`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span>{`
`}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# train (lots of moving parts here)`}</span>{`
clf`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{` _`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{` _ `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{` train`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`
  clf`}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{`_clf`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{` 
  train_ds`}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{`ds_train`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{`
  validation_ds`}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{`ds_validation`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{`
  test_ds`}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{`ds_test`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{` 
  out_dir`}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span><span parentName="code" {...{
            "className": "token string"
          }}>{`"results"`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{`
  log_dir`}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span><span parentName="code" {...{
            "className": "token string"
          }}>{`"results/logs"`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{`
  run_name`}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span><span parentName="code" {...{
            "className": "token string-interpolation"
          }}><span parentName="span" {...{
              "className": "token string"
            }}>{`f"logikcull-email-classifier-`}</span><span parentName="span" {...{
              "className": "token interpolation"
            }}><span parentName="span" {...{
                "className": "token punctuation"
              }}>{`{`}</span>{`backbone_name`}<span parentName="span" {...{
                "className": "token punctuation"
              }}>{`}`}</span></span><span parentName="span" {...{
              "className": "token string"
            }}>{`"`}</span></span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{`
  compute_metrics`}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{`MetricsUtils`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`compute_hf_metrics`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{`
  epochs`}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span><span parentName="code" {...{
            "className": "token number"
          }}>{`3`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{` 
  seed`}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span><span parentName="code" {...{
            "className": "token number"
          }}>{`42`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{`
  freeze_first_n`}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{` `}<span parentName="code" {...{
            "className": "token number"
          }}>{`8`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{`
  freeze_embeddings`}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span><span parentName="code" {...{
            "className": "token boolean"
          }}>{`True`}</span>{`
`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span>{`
`}<span parentName="code" {...{
            "className": "token comment"
          }}>{`# evaluate on held-out 20% of dataset`}</span>{`
preds`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`:`}</span>{` np`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`ndarray `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{` clf`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`predict_for_ds`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`ds_test`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span>{`
report `}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{` classification_report`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`
  y_true`}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{`ds_test`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`[`}</span><span parentName="code" {...{
            "className": "token string"
          }}>{`"labels"`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`]`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`.`}</span>{`numpy`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{`
  y_pred`}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span>{`preds`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{`
  target_names`}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`[`}</span><span parentName="code" {...{
            "className": "token string"
          }}>{`"ham"`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{` `}<span parentName="code" {...{
            "className": "token string"
          }}>{`"spam"`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`]`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`,`}</span>{`
  output_dict`}<span parentName="code" {...{
            "className": "token operator"
          }}>{`=`}</span><span parentName="code" {...{
            "className": "token boolean"
          }}>{`False`}</span>{`
`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span>{`
`}<span parentName="code" {...{
            "className": "token keyword"
          }}>{`print`}</span><span parentName="code" {...{
            "className": "token punctuation"
          }}>{`(`}</span>{`report`}<span parentName="code" {...{
            "className": "token punctuation"
          }}>{`)`}</span></code></pre></div>
    <p>{`The results reported in the following section use an included runnable to perform stratified `}<span parentName="p" {...{
        "className": "math math-inline"
      }}><span parentName="span" {...{
          "className": "katex"
        }}><span parentName="span" {...{
            "className": "katex-mathml"
          }}><math parentName="span" {...{
              "xmlns": "http://www.w3.org/1998/Math/MathML"
            }}><semantics parentName="math"><mrow parentName="semantics"><mi parentName="mrow">{`k`}</mi></mrow><annotation parentName="semantics" {...{
                  "encoding": "application/x-tex"
                }}>{`k`}</annotation></semantics></math></span><span parentName="span" {...{
            "className": "katex-html",
            "aria-hidden": "true"
          }}><span parentName="span" {...{
              "className": "base"
            }}><span parentName="span" {...{
                "className": "strut",
                "style": {
                  "height": "0.6944em"
                }
              }}></span><span parentName="span" {...{
                "className": "mord mathnormal",
                "style": {
                  "marginRight": "0.03148em"
                }
              }}>{`k`}</span></span></span></span></span>{`-fold cross validation.`}</p>
    <p>{`Train and evaluate a logistic regression classifier:`}</p>
    <div {...{
      "className": "gatsby-highlight",
      "data-language": "bash"
    }}><pre parentName="div" {...{
        "className": "language-bash"
      }}><code parentName="pre" {...{
          "className": "language-bash"
        }}>{`logikcull-evaluate --input `}<span parentName="code" {...{
            "className": "token string"
          }}>{`"data"`}</span>{` -k `}<span parentName="code" {...{
            "className": "token number"
          }}>{`5`}</span>{` --model `}<span parentName="code" {...{
            "className": "token string"
          }}>{`"lr"`}</span></code></pre></div>
    <p>{`Train and evaluate a BERT-based classifier:`}</p>
    <div {...{
      "className": "gatsby-highlight",
      "data-language": "bash"
    }}><pre parentName="div" {...{
        "className": "language-bash"
      }}><code parentName="pre" {...{
          "className": "language-bash"
        }}>{`logikcull-evaluate -k `}<span parentName="code" {...{
            "className": "token number"
          }}>{`5`}</span>{` --seeds `}<span parentName="code" {...{
            "className": "token number"
          }}>{`1`}</span>{` --model `}<span parentName="code" {...{
            "className": "token string"
          }}>{`"transformer"`}</span>{` --input data --output results`}</code></pre></div>
    <h2 {...{
      "id": "results",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h2" {...{
        "href": "#results",
        "aria-label": "results permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Results`}</h2>
    <p>{`To measure how sensitive our approaches are to different data splits, we'll report results using stratified 5-fold cross validation.  Each data split or `}<em parentName="p">{`fold`}</em>{` preserves the class ratio of the overall dataset (roughly 3:1).  We train and evaluate 5 models. Each of these `}<span parentName="p" {...{
        "className": "math math-inline"
      }}><span parentName="span" {...{
          "className": "katex"
        }}><span parentName="span" {...{
            "className": "katex-mathml"
          }}><math parentName="span" {...{
              "xmlns": "http://www.w3.org/1998/Math/MathML"
            }}><semantics parentName="math"><mrow parentName="semantics"><mi parentName="mrow">{`k`}</mi></mrow><annotation parentName="semantics" {...{
                  "encoding": "application/x-tex"
                }}>{`k`}</annotation></semantics></math></span><span parentName="span" {...{
            "className": "katex-html",
            "aria-hidden": "true"
          }}><span parentName="span" {...{
              "className": "base"
            }}><span parentName="span" {...{
                "className": "strut",
                "style": {
                  "height": "0.6944em"
                }
              }}></span><span parentName="span" {...{
                "className": "mord mathnormal",
                "style": {
                  "marginRight": "0.03148em"
                }
              }}>{`k`}</span></span></span></span></span>{` (`}<span parentName="p" {...{
        "className": "math math-inline"
      }}><span parentName="span" {...{
          "className": "katex"
        }}><span parentName="span" {...{
            "className": "katex-mathml"
          }}><math parentName="span" {...{
              "xmlns": "http://www.w3.org/1998/Math/MathML"
            }}><semantics parentName="math"><mrow parentName="semantics"><mi parentName="mrow">{`k`}</mi><mo parentName="mrow">{`=`}</mo><mn parentName="mrow">{`5`}</mn></mrow><annotation parentName="semantics" {...{
                  "encoding": "application/x-tex"
                }}>{`k=5`}</annotation></semantics></math></span><span parentName="span" {...{
            "className": "katex-html",
            "aria-hidden": "true"
          }}><span parentName="span" {...{
              "className": "base"
            }}><span parentName="span" {...{
                "className": "strut",
                "style": {
                  "height": "0.6944em"
                }
              }}></span><span parentName="span" {...{
                "className": "mord mathnormal",
                "style": {
                  "marginRight": "0.03148em"
                }
              }}>{`k`}</span><span parentName="span" {...{
                "className": "mspace",
                "style": {
                  "marginRight": "0.2778em"
                }
              }}></span><span parentName="span" {...{
                "className": "mrel"
              }}>{`=`}</span><span parentName="span" {...{
                "className": "mspace",
                "style": {
                  "marginRight": "0.2778em"
                }
              }}></span></span><span parentName="span" {...{
              "className": "base"
            }}><span parentName="span" {...{
                "className": "strut",
                "style": {
                  "height": "0.6444em"
                }
              }}></span><span parentName="span" {...{
                "className": "mord"
              }}>{`5`}</span></span></span></span></span>{`) models is trained on a distinct set of `}<span parentName="p" {...{
        "className": "math math-inline"
      }}><span parentName="span" {...{
          "className": "katex"
        }}><span parentName="span" {...{
            "className": "katex-mathml"
          }}><math parentName="span" {...{
              "xmlns": "http://www.w3.org/1998/Math/MathML"
            }}><semantics parentName="math"><mrow parentName="semantics"><mi parentName="mrow">{`k`}</mi><mo parentName="mrow">{`−`}</mo><mn parentName="mrow">{`1`}</mn></mrow><annotation parentName="semantics" {...{
                  "encoding": "application/x-tex"
                }}>{`k-1`}</annotation></semantics></math></span><span parentName="span" {...{
            "className": "katex-html",
            "aria-hidden": "true"
          }}><span parentName="span" {...{
              "className": "base"
            }}><span parentName="span" {...{
                "className": "strut",
                "style": {
                  "height": "0.7778em",
                  "verticalAlign": "-0.0833em"
                }
              }}></span><span parentName="span" {...{
                "className": "mord mathnormal",
                "style": {
                  "marginRight": "0.03148em"
                }
              }}>{`k`}</span><span parentName="span" {...{
                "className": "mspace",
                "style": {
                  "marginRight": "0.2222em"
                }
              }}></span><span parentName="span" {...{
                "className": "mbin"
              }}>{`−`}</span><span parentName="span" {...{
                "className": "mspace",
                "style": {
                  "marginRight": "0.2222em"
                }
              }}></span></span><span parentName="span" {...{
              "className": "base"
            }}><span parentName="span" {...{
                "className": "strut",
                "style": {
                  "height": "0.6444em"
                }
              }}></span><span parentName="span" {...{
                "className": "mord"
              }}>{`1`}</span></span></span></span></span>{` folds and tested on the remaining fold.  Finally, the precision, recall, and F1 scores of each model are averaged (see the table below).`}</p>
    <HTMLTable condensed striped mdxType="HTMLTable">
  <caption>Performance of our two models for the SPAM class.  Numbers are an average of results on stratified 5-fold cross validation.</caption>
  <thead>
    <tr>
        <td><strong>Model</strong></td>
        <td><strong>Precision</strong></td>
        <td><strong>Recall</strong></td>
        <td><strong>F1</strong></td>
        <td><strong>Training data size</strong></td>
        <td><strong>Support (train)</strong></td>
        <td><strong>Support (eval)</strong></td>
    </tr>
  </thead>
  <tbody>
        <tr>
          <td rowSpan="1" align="center">
            Logistic Regression (TF-IDF for unigrams and bigrams)
          </td>
          <td>
            98
          </td>
          <td>
            97.7
          </td>
          <td>
            97.8
          </td>
          <td>
            1200
          </td>
          <td>
            4138
          </td>
          <td>
            300
          </td>
        </tr>
        <tr>
          <td rowSpan="1" align="center">
            BERT (base-uncased with the first 8 layers and embeddings frozen)
          </td>
          <td>
            99.93
          </td>
          <td>
            100
          </td>
          <td>
            99.97
          </td>
          <td>
            1080
          </td>
          <td>
            3723
          </td>
          <td>
            300
          </td>
        </tr>
    </tbody>
    </HTMLTable>
    <p>{`For our BERT-based classifier, we train each fold for a maximum of three epochs`}<sup parentName="p" {...{
        "id": "fnref-3"
      }}><a parentName="sup" {...{
          "href": "#fn-3",
          "className": "footnote-ref"
        }}>{`3`}</a></sup>{` with early stopping`}<sup parentName="p" {...{
        "id": "fnref-4"
      }}><a parentName="sup" {...{
          "href": "#fn-4",
          "className": "footnote-ref"
        }}>{`4`}</a></sup>{` to avoid useless computation and help curb overfitting.`}<sup parentName="p" {...{
        "id": "fnref-5"
      }}><a parentName="sup" {...{
          "href": "#fn-5",
          "className": "footnote-ref"
        }}>{`5`}</a></sup>{`  `}</p>
    <p>{`The following box plots show how precision, recall, and F1 scores vary across our stratified cross validation folds for our two models (top: logistic regression; bottom: BERT-based model).`}</p>
    <p><img parentName="p" {...{
        "src": "/48851b7c82fc159de07d2a7850908363/spam-boxplot.svg",
        "alt": "\"box plot of CV scores\"",
        "title": "Box plot of precision, recall, and F1 scores for our logistic regression classifier across 5 stratified cross validation folds."
      }}></img></p>
    <p><img parentName="p" {...{
        "src": "/5c0acb2f04149e4d8e836d811326402b/spam-boxplot-transformer-bert-base-uncased.svg",
        "alt": "\"box plot of CV scores\"",
        "title": "Box plot of precision, recall, and F1 scores for our BERT-based classifier across 5 stratified cross validation folds."
      }}></img></p>
    <div {...{
      "className": "admonition admonition-warning alert alert--danger"
    }}><div parentName="div" {...{
        "className": "admonition-heading"
      }}><h5 parentName="div"><span parentName="h5" {...{
            "className": "admonition-icon"
          }}><svg parentName="span" {...{
              "xmlns": "http://www.w3.org/2000/svg",
              "width": "12",
              "height": "16",
              "viewBox": "0 0 12 16"
            }}><path parentName="svg" {...{
                "fillRule": "evenodd",
                "d": "M5.05.31c.81 2.17.41 3.38-.52 4.31C3.55 5.67 1.98 6.45.9 7.98c-1.45 2.05-1.7 6.53 3.53 7.7-2.2-1.16-2.67-4.52-.3-6.61-.61 2.03.53 3.33 1.94 2.86 1.39-.47 2.3.53 2.27 1.67-.02.78-.31 1.44-1.13 1.81 3.42-.59 4.78-3.42 4.78-5.56 0-2.84-2.53-3.22-1.25-5.61-1.52.13-2.03 1.13-1.89 2.75.09 1.08-1.02 1.8-1.86 1.33-.67-.41-.66-1.19-.06-1.78C8.18 5.31 8.68 2.45 5.05.32L5.03.3l.02.01z"
              }}></path></svg></span>{`Note`}</h5></div><div parentName="div" {...{
        "className": "admonition-content"
      }}><p parentName="div">{`Deep neural networks are sensitive to the random seed used for initializing weights`}<sup parentName="p" {...{
            "id": "fnref-6"
          }}><a parentName="sup" {...{
              "href": "#fn-6",
              "className": "footnote-ref"
            }}>{`6`}</a></sup>{`, but the results reported here use only a single random seed.`}</p></div></div>
    <p>{`Our BERT-based classifier shows far less variation from fold to fold`}<sup parentName="p" {...{
        "id": "fnref-7"
      }}><a parentName="sup" {...{
          "href": "#fn-7",
          "className": "footnote-ref"
        }}>{`7`}</a></sup>{`, but both models show performance consistently in the upper 90s.`}</p>
    <h3 {...{
      "id": "learned-features",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h3" {...{
        "href": "#learned-features",
        "aria-label": "learned features permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Learned features`}</h3>
    <p>{`Logistic regression learns a weight for each feature.  Positive-valued features are correlated with our class of interest (`}<strong parentName="p">{`spam`}</strong>{` in this case).  The greater the magnitude of a weight, the larger the role it plays in the classifier's decision.  Inspecting these weights gives us some insight into the decision process. The table below shows the top 20 features associated with the `}<strong parentName="p">{`spam`}</strong>{` class.`}</p>
    <HTMLTable condensed striped mdxType="HTMLTable">
  <caption>Top 20 features associated with the <strong>spam</strong> class. Presented in descending order using an 80:20 train-test split of the dataset.</caption>
  <thead>
    <tr>
      <th><strong>feature</strong></th>
      <th><strong>weight</strong></th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <td className="feature">TF-IDF(http)</td>
      <td>3.450437</td>
    </tr>
    <tr>
      <td className="feature">TF-IDF(subject)</td>
      <td>2.461667</td>
    </tr>
    <tr>
      <td className="feature">TF-IDF(money)</td>
      <td>1.914449</td>
    </tr>
    <tr>
      <td className="feature">TF-IDF(online)</td>
      <td>1.909308</td>
    </tr>
    <tr>
      <td className="feature">TF-IDF(2004)</td>
      <td>1.789275</td>
    </tr>
    <tr>
      <td className="feature">TF-IDF(click)</td>
      <td>1.552477</td>
    </tr>
    <tr>
      <td className="feature">TF-IDF(best)</td>
      <td>1.551937</td>
    </tr>
    <tr>
      <td className="feature">TF-IDF(www)</td>
      <td>1.543136</td>
    </tr>
    <tr>
      <td className="feature">TF-IDF(pain)</td>
      <td>1.497427</td>
    </tr>
    <tr>
      <td className="feature">TF-IDF(meds)</td>
      <td>1.471923</td>
    </tr>
    <tr>
      <td className="feature">TF-IDF(http www)</td>
      <td>1.390868</td>
    </tr>
    <tr>
      <td className="feature">TF-IDF(paliourg)</td>
      <td>1.357542</td>
    </tr>
    <tr>
      <td className="feature">TF-IDF(free)</td>
      <td>1.326871</td>
    </tr>
    <tr>
      <td className="feature">TF-IDF(low)</td>
      <td>1.310563</td>
    </tr>
    <tr>
      <td className="feature">TF-IDF(account)</td>
      <td>1.295502</td>
    </tr>
    <tr>
      <td className="feature">TF-IDF(email)</td>
      <td>1.285660</td>
    </tr>
    <tr>
      <td className="feature">TF-IDF(life)</td>
      <td>1.281720</td>
    </tr>
    <tr>
      <td className="feature">TF-IDF(remove)</td>
      <td>1.275990</td>
    </tr>
    <tr>
      <td className="feature">TF-IDF(prices)</td>
      <td>1.268817</td>
    </tr>
    <tr>
      <td className="feature">TF-IDF(international)</td>
      <td>1.246361</td>
    </tr>
  </tbody>
    </HTMLTable>
    <p>{`Note that the features reported here model how important the `}<strong parentName="p">{`relevance`}</strong>{` of a particular `}<span parentName="p" {...{
        "className": "math math-inline"
      }}><span parentName="span" {...{
          "className": "katex"
        }}><span parentName="span" {...{
            "className": "katex-mathml"
          }}><math parentName="span" {...{
              "xmlns": "http://www.w3.org/1998/Math/MathML"
            }}><semantics parentName="math"><mrow parentName="semantics"><mi parentName="mrow">{`n`}</mi></mrow><annotation parentName="semantics" {...{
                  "encoding": "application/x-tex"
                }}>{`n`}</annotation></semantics></math></span><span parentName="span" {...{
            "className": "katex-html",
            "aria-hidden": "true"
          }}><span parentName="span" {...{
              "className": "base"
            }}><span parentName="span" {...{
                "className": "strut",
                "style": {
                  "height": "0.4306em"
                }
              }}></span><span parentName="span" {...{
                "className": "mord mathnormal"
              }}>{`n`}</span></span></span></span></span>{`-gram is to determining whether or not an email is spam.  For instance, documents where terms like `}<em parentName="p">{`free`}</em>{`, `}<em parentName="p">{`click`}</em>{`, and `}<em parentName="p">{`money`}</em>{` are highly relevant are strongly associated with spam. Top features also reveal potential issues with generalization.  For instance, while the relevance of URLs in a document is strongly associated with spam in our dataset (i.e., docs that revolve around URLs are likely to be spam), this feature may be unreliable beyond this dataset (ex. emails that contain a pointer to a work doc). Substituting domains in place of URLs (ex. `}<code parentName="p" {...{
        "className": "language-text"
      }}>{`https://google.com`}</code>{` `}{`→`}{` `}<code parentName="p" {...{
        "className": "language-text"
      }}>{`URL_GOOGLE`}</code>{`) would make for a more robust feature, especially when considered alongside user metadata.  Similarly, `}<code parentName="p" {...{
        "className": "language-text"
      }}>{`subject`}</code>{` (in its `}<em parentName="p">{`lowercase`}</em>{` form) is more commonly found in the body of spam emails (58 vs 133 docs), but this may not hold true in all settings (ex. the inbox of Melvil "The Decimal" Dewey, long threads with tech support, etc.). A real world model would need to be monitored and periodically retrained on new data to handle data drift.`}</p>
    {
      /* ### Error analysis */
    }
    <h2 {...{
      "id": "improvements",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h2" {...{
        "href": "#improvements",
        "aria-label": "improvements permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Improvements`}</h2>
    <div {...{
      "className": "admonition admonition-info alert alert--info"
    }}><div parentName="div" {...{
        "className": "admonition-heading"
      }}><h5 parentName="div"><span parentName="h5" {...{
            "className": "admonition-icon"
          }}><svg parentName="span" {...{
              "xmlns": "http://www.w3.org/2000/svg",
              "width": "14",
              "height": "16",
              "viewBox": "0 0 14 16"
            }}><path parentName="svg" {...{
                "fillRule": "evenodd",
                "d": "M7 2.3c3.14 0 5.7 2.56 5.7 5.7s-2.56 5.7-5.7 5.7A5.71 5.71 0 0 1 1.3 8c0-3.14 2.56-5.7 5.7-5.7zM7 1C3.14 1 0 4.14 0 8s3.14 7 7 7 7-3.14 7-7-3.14-7-7-7zm1 3H6v5h2V4zm0 6H6v2h2v-2z"
              }}></path></svg></span>{`Note`}</h5></div><div parentName="div" {...{
        "className": "admonition-content"
      }}><p parentName="div">{`Since our BERT-based classifier has reached a performance ceiling on this dataset, the improvements in this section largely focus on our logistic regression classifier.  Improvements to our transformer-based model would probably center around efficiency (ex. more compact backbones, model compression via quantization, etc).`}</p></div></div>
    <p>{`While performance is already quite strong on our toy dataset, there are a number of ways we could improve our model to handle more difficult cases of spam in the wild.`}</p>
    <ul>
      <li parentName="ul">
        <p parentName="li"><strong parentName="p">{`Normalize text`}</strong>{`: Normalizing mentions of prices (ex. "only $ 197 !" -> "only DOLLAR !") and replacing URLs with their domain name may help to uncover useful patterns in the data by treating semantically related mentions more uniformly.`}</p>
      </li>
      <li parentName="ul">
        <p parentName="li"><strong parentName="p">{`Section-aware features`}</strong><undefined parentName="p">{`: The current logistic regression model derives undifferentiated features from the contents of each email.  Following earlier work on domain adaptation`}<span {...{
              "id": "citation-0",
              "data-hover": ""
            }}><span parentName="span" {...{
                "className": "citation-number"
              }}>{`[2]`}</span></span>{`, in addition to an undifferentiated treatment, we can model features derived from the email's subject distinctly from those derived from the contents of the body (ex. `}</undefined><code parentName="p" {...{
            "className": "language-text"
          }}>{`subject_contains("!")`}</code>{`, `}<code parentName="p" {...{
            "className": "language-text"
          }}>{`body_contains("!")`}</code>{`, `}<code parentName="p" {...{
            "className": "language-text"
          }}>{`email_contains("!")`}</code>{`).  This allows us to reuse existing features to uncover telling patterns.  For `}<span parentName="p" {...{
            "className": "math math-inline"
          }}><span parentName="span" {...{
              "className": "katex"
            }}><span parentName="span" {...{
                "className": "katex-mathml"
              }}><math parentName="span" {...{
                  "xmlns": "http://www.w3.org/1998/Math/MathML"
                }}><semantics parentName="math"><mrow parentName="semantics"><mi parentName="mrow">{`n`}</mi></mrow><annotation parentName="semantics" {...{
                      "encoding": "application/x-tex"
                    }}>{`n`}</annotation></semantics></math></span><span parentName="span" {...{
                "className": "katex-html",
                "aria-hidden": "true"
              }}><span parentName="span" {...{
                  "className": "base"
                }}><span parentName="span" {...{
                    "className": "strut",
                    "style": {
                      "height": "0.4306em"
                    }
                  }}></span><span parentName="span" {...{
                    "className": "mord mathnormal"
                  }}>{`n`}</span></span></span></span></span>{`-gram features, this adaptation can be handled simply by prefixing each token with a special symbol (ex. `}<code parentName="p" {...{
            "className": "language-text"
          }}>{`"Subject : buy now!"`}</code>{` -> `}<code parentName="p" {...{
            "className": "language-text"
          }}>{`"SUBJ_Subject SUBJ_: SUBJ_buy SUBJ_now"`}</code>{`).`}</p>
      </li>
      <li parentName="ul">
        <p parentName="li"><strong parentName="p">{`Feature combinations`}</strong>{`: Logistic regression is a linear model that does not explicitly learn to combine features. One option is to use a non-linear classifier (ex. MLP, decision tree, CNN, etc.).  Alternatively, we can augment our features with pair-wise combinations and use a strong L1 regularization term to eliminate uninformative features.`}</p>
      </li>
      <li parentName="ul">
        <p parentName="li"><strong parentName="p">{`Introduce `}<code parentName="strong" {...{
              "className": "language-text"
            }}>{`UNK`}</code>{` for unknown tokens`}</strong>{`: Our logistic regression model will simply ignore unknown terms encountered at inference time.  We can learn to model unknown terms by preprocessing our training data.  When building the vocabulary, we can impose a minimum threshold on the number of documents an `}<span parentName="p" {...{
            "className": "math math-inline"
          }}><span parentName="span" {...{
              "className": "katex"
            }}><span parentName="span" {...{
                "className": "katex-mathml"
              }}><math parentName="span" {...{
                  "xmlns": "http://www.w3.org/1998/Math/MathML"
                }}><semantics parentName="math"><mrow parentName="semantics"><mi parentName="mrow">{`n`}</mi></mrow><annotation parentName="semantics" {...{
                      "encoding": "application/x-tex"
                    }}>{`n`}</annotation></semantics></math></span><span parentName="span" {...{
                "className": "katex-html",
                "aria-hidden": "true"
              }}><span parentName="span" {...{
                  "className": "base"
                }}><span parentName="span" {...{
                    "className": "strut",
                    "style": {
                      "height": "0.4306em"
                    }
                  }}></span><span parentName="span" {...{
                    "className": "mord mathnormal"
                  }}>{`n`}</span></span></span></span></span>{`-gram must be encountered in to be included in our vocabulary.  Words (unigrams) that fall below this threshold can be replaced with a special token (ex. `}<code parentName="p" {...{
            "className": "language-text"
          }}>{`UNK`}</code>{`).  Unknown tokens encountered at inference time will then be represented using the `}<code parentName="p" {...{
            "className": "language-text"
          }}>{`UNK`}</code>{` feature. Docs with high counts of `}<code parentName="p" {...{
            "className": "language-text"
          }}>{`UNK`}</code>{` are likely to be spam (gibberish).`}</p>
      </li>
      <li parentName="ul">
        <p parentName="li"><strong parentName="p">{`Character-level features`}</strong>{`: Emails with many tokens that repeat the same letter 3 or more times (ex. "haaaard") are likely to be spam. Spam emails also often break up real words with spaces or punctuation to circumvent filters.  This information can be captured by modeling character `}<span parentName="p" {...{
            "className": "math math-inline"
          }}><span parentName="span" {...{
              "className": "katex"
            }}><span parentName="span" {...{
                "className": "katex-mathml"
              }}><math parentName="span" {...{
                  "xmlns": "http://www.w3.org/1998/Math/MathML"
                }}><semantics parentName="math"><mrow parentName="semantics"><mi parentName="mrow">{`n`}</mi></mrow><annotation parentName="semantics" {...{
                      "encoding": "application/x-tex"
                    }}>{`n`}</annotation></semantics></math></span><span parentName="span" {...{
                "className": "katex-html",
                "aria-hidden": "true"
              }}><span parentName="span" {...{
                  "className": "base"
                }}><span parentName="span" {...{
                    "className": "strut",
                    "style": {
                      "height": "0.4306em"
                    }
                  }}></span><span parentName="span" {...{
                    "className": "mord mathnormal"
                  }}>{`n`}</span></span></span></span></span>{`-grams or engineering such features.`}</p>
      </li>
      <li parentName="ul">
        <p parentName="li"><strong parentName="p">{`Profanity and sentiment`}</strong>{`: Models trained to detect profanity (or even sentiment) might provide useful and complementary signal in an ensemble (or serve as a starting point for fine-tuning a transformer-based model).`}</p>
      </li>
      <li parentName="ul">
        <p parentName="li"><strong parentName="p">{`Improbable token sequences are probably spam`}</strong>{`: Emails with high perplexity or long sequences of nonce/unknown tokens are likely to be spam (ex. "catenate sneermullion conscientious damanonymous").`}</p>
      </li>
    </ul>
    <h1 {...{
      "id": "conclusions",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h1" {...{
        "href": "#conclusions",
        "aria-label": "conclusions permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Conclusions`}</h1>
    <p>{`Selecting the "right" model will always depend on the use case and constraints.  `}</p>
    <p>{`In terms of raw performance, the BERT-based classifier has a slight edge on this dataset and is likely to better generalize to unseen data.  For this toy dataset, though, it is akin to using a cannon to kill a fly. In contrast, our logistic regression classifier trains in seconds, provides a higher degree of control, and requires no specialized hardware for training or inference.  `}</p>
    <h1 {...{
      "id": "references",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h1" {...{
        "href": "#references",
        "aria-label": "references permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`References`}</h1>
    <div><ol parentName="div"><li parentName="ol"><b parentName="li">{`BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding`}</b>{` `}<br parentName="li"></br>{`Devlin, J., Chang, M., Lee, K. and Toutanova, K., 2018.  `}<a parentName="li" {...{
            "href": "https://doi.org/10.48550/arXiv.1810.04805",
            "style": {
              "textDecoration": "inherit"
            }
          }}>{`DOI: 10.48550/arXiv.1810.04805`}</a></li><li parentName="ol"><b parentName="li">{`Frustratingly Easy Domain Adaptation`}</b>{` `}<br parentName="li"></br>{`Daume III, H., 2007. Proceedings of the 45th Annual Meeting of the Association of Computational Linguistics, pp. 256--263. Association for Computational Linguistics. `}<a parentName="li" {...{
            "href": "https://doi.org/10.48550/arXiv.0907.1815",
            "style": {
              "textDecoration": "inherit"
            }
          }}>{`DOI: 10.48550/arXiv.0907.1815`}</a></li></ol></div>
    <h2 {...{
      "id": "footnotes",
      "style": {
        "position": "relative"
      }
    }}><a parentName="h2" {...{
        "href": "#footnotes",
        "aria-label": "footnotes permalink",
        "className": "md-header before"
      }}><svg parentName="a" {...{
          "aria-hidden": "true",
          "height": "20",
          "version": "1.1",
          "viewBox": "0 0 16 16",
          "width": "20"
        }}><path parentName="svg" {...{
            "fillRule": "evenodd",
            "d": "M4 9h1v1H4c-1.5 0-3-1.69-3-3.5S2.55 3 4 3h4c1.45 0 3 1.69 3 3.5 0 1.41-.91 2.72-2 3.25V8.59c.58-.45 1-1.27 1-2.09C10 5.22 8.98 4 8 4H4c-.98 0-2 1.22-2 2.5S3 9 4 9zm9-3h-1v1h1c1 0 2 1.22 2 2.5S13.98 12 13 12H9c-.98 0-2-1.22-2-2.5 0-.83.42-1.64 1-2.09V6.25c-1.09.53-2 1.84-2 3.25C6 11.31 7.55 13 9 13h4c1.45 0 3-1.69 3-3.5S14.5 6 13 6z"
          }}></path></svg></a>{`Footnotes`}</h2>

    <div {...{
      "className": "footnotes"
    }}>
      <hr parentName="div"></hr>
      <ol parentName="div">
        <li parentName="ol" {...{
          "id": "fn-1"
        }}>{`For an introduction to cross entropy, `}<a parentName="li" {...{
            "href": "https://www.youtube.com/watch?v=mvUg_7LG4RE&t=5m6s",
            "target": "_self",
            "rel": "nofollow"
          }}>{`see this clip`}</a><a parentName="li" {...{
            "href": "#fnref-1",
            "className": "footnote-backref"
          }}>{`↩`}</a></li>
        <li parentName="ol" {...{
          "id": "fn-2"
        }}>{`BERT's pre-training involves two tasks: The first is learning to predict what words co-occur with one another (masked language modeling) and the second is about determining which pairs of sentences follow one another (next sentence prediction).  The corpus used for pretraining consists of billions of words (the English Book Corpus and a subset of the English Wikipedia).  In masked language modeling (MLM), a percentage of words (~15%) are randomly hidden or "masked."  Much like a Cloze test, the model must learn to fill in these blanks using clues from the surrounding context (a position-aware enhancement of the continuous bag of words).  In next sentence prediction (NSP), pairs of sentences are randomly sampled from the corpus and the model must learn to differentiate true pairs from false ones (i.e., sentences that do not follow one another in the original text).`}<a parentName="li" {...{
            "href": "#fnref-2",
            "className": "footnote-backref"
          }}>{`↩`}</a></li>
        <li parentName="ol" {...{
          "id": "fn-3"
        }}>{`a complete pass over the training data`}<a parentName="li" {...{
            "href": "#fnref-3",
            "className": "footnote-backref"
          }}>{`↩`}</a></li>
        <li parentName="ol" {...{
          "id": "fn-4"
        }}>{`We monitor performance on a portion of held out data.  Once performance begins to deteriorate or cease changing on this held out data, we stop training.  `}<a parentName="li" {...{
            "href": "#fnref-4",
            "className": "footnote-backref"
          }}>{`↩`}</a></li>
        <li parentName="ol" {...{
          "id": "fn-5"
        }}>{`We fine-tuned BERT on a Macbook Pro using PyTorch's integration with the `}<a parentName="li" {...{
            "href": "https://pytorch.org/docs/stable/notes/mps.html#mps-backend",
            "target": "_self",
            "rel": "nofollow"
          }}>{`Metal Performance Shaders (MPS) backend`}</a>{` (`}<code parentName="li" {...{
            "className": "language-text"
          }}>{`mps`}</code>{` available since `}<a parentName="li" {...{
            "href": "https://developer.apple.com/metal/pytorch/",
            "target": "_self",
            "rel": "nofollow"
          }}>{`PyTorch v1.12`}</a>{`).  With hardware acceleration, 5-fold cross validation took about an hour to train.`}<a parentName="li" {...{
            "href": "#fnref-5",
            "className": "footnote-backref"
          }}>{`↩`}</a></li>
        <li parentName="ol" {...{
          "id": "fn-6"
        }}>{`In the case of our pre-trained BERT-based model, the randomly initialized weights are those of the classification head.`}<a parentName="li" {...{
            "href": "#fnref-6",
            "className": "footnote-backref"
          }}>{`↩`}</a></li>
        <li parentName="ol" {...{
          "id": "fn-7"
        }}>{`This result is not surprising given that our BERT-based model is a nonlinear model with 3 orders of magnitude more trainable parameters.`}<a parentName="li" {...{
            "href": "#fnref-7",
            "className": "footnote-backref"
          }}>{`↩`}</a></li>
      </ol>
    </div>
    </MDXLayout>;
}
;
// Attach a static `isMDXComponent` flag to the exported component.
// NOTE(review): presumably read by the MDX runtime/tooling to recognize
// compiled MDX output — confirm against the @mdx-js version in use.
MDXContent.isMDXComponent = true;
      