last year's lecture material moved to directory 2025
94
lectures/2025/lecture-n-1/.gitignore
vendored
Normal file
@@ -0,0 +1,94 @@
|
||||
## Core latex/pdflatex auxiliary files:
|
||||
*.aux
|
||||
*.lof
|
||||
*.log
|
||||
*.lot
|
||||
*.fls
|
||||
*.out
|
||||
*.toc
|
||||
|
||||
## Intermediate documents:
|
||||
*.dvi
|
||||
# these rules might exclude image files for figures etc.
|
||||
# *.ps
|
||||
# *.eps
|
||||
# *.pdf
|
||||
|
||||
## Bibliography auxiliary files (bibtex/biblatex/biber):
|
||||
*.bbl
|
||||
*.bcf
|
||||
*.blg
|
||||
*-blx.aux
|
||||
*-blx.bib
|
||||
*.run.xml
|
||||
|
||||
## Build tool auxiliary files:
|
||||
*.fdb_latexmk
|
||||
*.synctex.gz
|
||||
*.synctex.gz(busy)
|
||||
*.pdfsync
|
||||
|
||||
## Auxiliary and intermediate files from other packages:
|
||||
|
||||
# algorithms
|
||||
*.alg
|
||||
*.loa
|
||||
|
||||
# amsthm
|
||||
*.thm
|
||||
|
||||
# beamer
|
||||
*.nav
|
||||
*.snm
|
||||
*.vrb
|
||||
|
||||
# glossaries
|
||||
*.acn
|
||||
*.acr
|
||||
*.glg
|
||||
*.glo
|
||||
*.gls
|
||||
|
||||
# hyperref
|
||||
*.brf
|
||||
|
||||
# listings
|
||||
*.lol
|
||||
|
||||
# makeidx
|
||||
*.idx
|
||||
*.ilg
|
||||
*.ind
|
||||
*.ist
|
||||
|
||||
# minitoc
|
||||
*.maf
|
||||
*.mtc
|
||||
*.mtc0
|
||||
|
||||
# minted
|
||||
*.pyg
|
||||
|
||||
# nomencl
|
||||
*.nlo
|
||||
|
||||
# sagetex
|
||||
*.sagetex.sage
|
||||
*.sagetex.py
|
||||
*.sagetex.scmd
|
||||
|
||||
# sympy
|
||||
*.sout
|
||||
*.sympy
|
||||
sympy-plots-for-*.tex/
|
||||
|
||||
# todonotes
|
||||
*.tdo
|
||||
|
||||
# xindy
|
||||
*.xdy
|
||||
|
||||
# useless files
|
||||
color_scheme.png
|
||||
identicon.png
|
||||
._wordcount_selection.tex
|
||||
187
lectures/2025/lecture-n-1/beamerthemelucid.sty
Normal file
@@ -0,0 +1,187 @@
|
||||
\usepackage{tikz}
|
||||
\usetikzlibrary{calc}
|
||||
|
||||
% -------- COLOR SCHEME --------
|
||||
\definecolor{PrimaryColor}{RGB}{7,79,140} % primary color (blue)
|
||||
\definecolor{SecondaryColor}{RGB}{242,88,26} % bulleted lists
|
||||
\definecolor{BackgroundColor}{RGB}{255,255,255} % background & titles (white)
|
||||
\definecolor{TextColor}{RGB}{0,0,0} % text (black)
|
||||
\definecolor{ProgBarBGColor}{RGB}{175,175,175} % progress bar background (grey)
|
||||
|
||||
|
||||
% set colours
|
||||
\setbeamercolor{normal text}{fg=TextColor}\usebeamercolor*{normal text}
|
||||
\setbeamercolor{alerted text}{fg=PrimaryColor}
|
||||
\setbeamercolor{section in toc}{fg=PrimaryColor}
|
||||
\setbeamercolor{structure}{fg=SecondaryColor}
|
||||
\hypersetup{colorlinks,linkcolor=,urlcolor=SecondaryColor}
|
||||
|
||||
% set fonts
|
||||
\setbeamerfont{itemize/enumerate body}{size=\large}
|
||||
\setbeamerfont{itemize/enumerate subbody}{size=\normalsize}
|
||||
\setbeamerfont{itemize/enumerate subsubbody}{size=\small}
|
||||
|
||||
% make pixelated bullets
|
||||
\setbeamertemplate{itemize item}{
|
||||
\tikz{
|
||||
\draw[fill=SecondaryColor,draw=none] (0, 0) rectangle(0.1, 0.1);
|
||||
\draw[fill=SecondaryColor,draw=none] (0.1, 0.1) rectangle(0.2, 0.2);
|
||||
\draw[fill=SecondaryColor,draw=none] (0, 0.2) rectangle(0.1, 0.3);
|
||||
}
|
||||
}
|
||||
\setbeamertemplate{itemize subitem}{
|
||||
\tikz{
|
||||
\draw[fill=SecondaryColor,draw=none] (0, 0) rectangle(0.075, 0.075);
|
||||
\draw[fill=SecondaryColor,draw=none] (0.075, 0.075) rectangle(0.15, 0.15);
|
||||
\draw[fill=SecondaryColor,draw=none] (0, 0.15) rectangle(0.075, 0.225);
|
||||
}
|
||||
}
|
||||
\setbeamertemplate{itemize subsubitem}{
|
||||
\tikz{
|
||||
\draw[fill=SecondaryColor,draw=none] (0.050, 0.050) rectangle(0.15, 0.15);
|
||||
}
|
||||
}
|
||||
|
||||
% disable navigation
|
||||
\setbeamertemplate{navigation symbols}{}
|
||||
|
||||
% disable the damn default logo!
|
||||
\setbeamertemplate{sidebar right}{}
|
||||
|
||||
% custom draw the title page above
|
||||
\setbeamertemplate{title page}{}
|
||||
|
||||
% again, manually draw the frame title above
|
||||
\setbeamertemplate{frametitle}{}
|
||||
|
||||
% disable "Figure:" in the captions
|
||||
% TODO: somehow this doesn't work for md-generated slides
|
||||
%\setbeamertemplate{caption}{\tiny\insertcaption}
|
||||
%\setbeamertemplate{caption label separator}{}
|
||||
|
||||
% add some space below the footnotes so they don't end up on the progress bar
|
||||
\setbeamertemplate{footnote}{
|
||||
\parindent 0em
|
||||
\noindent
|
||||
\raggedright
|
||||
\hbox to 0.8em{\hfil\insertfootnotemark}
|
||||
\insertfootnotetext
|
||||
\par
|
||||
\vspace{2em}
|
||||
}
|
||||
|
||||
% add the same vspace both before and after quotes
|
||||
\setbeamertemplate{quote begin}{\vspace{0.5em}}
|
||||
\setbeamertemplate{quote end}{\vspace{0.5em}}
|
||||
|
||||
% progress bar counters
|
||||
\newcounter{showProgressBar}
|
||||
\setcounter{showProgressBar}{1}
|
||||
\newcounter{showSlideNumbers}
|
||||
\setcounter{showSlideNumbers}{1}
|
||||
\newcounter{showSlideTotal}
|
||||
\setcounter{showSlideTotal}{1}
|
||||
|
||||
% use \makeatletter for our progress bar definitions
|
||||
% progress bar idea from http://tex.stackexchange.com/a/59749/44221
|
||||
% slightly adapted for visual purposes here
|
||||
\makeatletter
|
||||
\newcount\progressbar@tmpcounta% auxiliary counter
|
||||
\newcount\progressbar@tmpcountb% auxiliary counter
|
||||
\newdimen\progressbar@pbwidth %progressbar width
|
||||
\newdimen\progressbar@tmpdim % auxiliary dimension
|
||||
|
||||
\newdimen\slidewidth % auxiliary dimension
|
||||
\newdimen\slideheight % auxiliary dimension
|
||||
|
||||
% make the progress bar go across the screen
|
||||
\progressbar@pbwidth=\the\paperwidth
|
||||
\slidewidth=\the\paperwidth
|
||||
\slideheight=\the\paperheight
|
||||
|
||||
% draw everything with tikz
|
||||
\setbeamertemplate{background}{ % all slides
|
||||
% progress bar stuff
|
||||
\progressbar@tmpcounta=\insertframenumber
|
||||
\progressbar@tmpcountb=\inserttotalframenumber
|
||||
\progressbar@tmpdim=\progressbar@pbwidth
|
||||
\divide\progressbar@tmpdim by 100
|
||||
\multiply\progressbar@tmpdim by \progressbar@tmpcounta
|
||||
\divide\progressbar@tmpdim by \progressbar@tmpcountb
|
||||
\multiply\progressbar@tmpdim by 100
|
||||
|
||||
\begin{tikzpicture}
|
||||
% set up the entire slide as the canvas
|
||||
\useasboundingbox (0,0) rectangle(\the\paperwidth,\the\paperheight);
|
||||
|
||||
% background
|
||||
\fill[color=BackgroundColor] (0,0) rectangle(\the\paperwidth,\the\paperheight);
|
||||
|
||||
\ifnum\thepage=1\relax % only title slides
|
||||
% primary color rectangle
|
||||
\fill[color=PrimaryColor] (0, 4cm) rectangle(\slidewidth,\slideheight);
|
||||
|
||||
% text (title, subtitle, author, date)
|
||||
\node[anchor=south,text width=\slidewidth-1cm,inner xsep=0.5cm] at (0.5\slidewidth,4cm) {\color{BackgroundColor}\Huge\textbf{\inserttitle}};
|
||||
\node[anchor=north east,text width=\slidewidth-1cm,align=right] at (\slidewidth-0.4cm,4cm) {\color{PrimaryColor}\large\textbf{\insertsubtitle}};
|
||||
\node at (0.5\slidewidth,2cm) {\color{PrimaryColor}\LARGE\insertauthor};
|
||||
\node at (0.5\slidewidth,1.25cm) {\color{PrimaryColor}\Large\insertinstitute};
|
||||
\node[anchor=south east] at(\slidewidth,0cm) {\color{PrimaryColor}\tiny\insertdate};
|
||||
\else % other slides
|
||||
% title bar
|
||||
\fill[color=PrimaryColor] (0, \slideheight-1cm) rectangle(\slidewidth,\slideheight);
|
||||
|
||||
% slide title
|
||||
\node[anchor=north,text width=\slidewidth-0.75cm,inner xsep=0.5cm,inner ysep=0.25cm] at (0.5\slidewidth,\slideheight) {\color{BackgroundColor}\huge\textbf{\insertframetitle}};
|
||||
|
||||
% logo (TODO: autoscale; now it expects 350x350)
|
||||
\node[anchor=north east] at (\slidewidth-0.25cm,\slideheight+0.06cm){\insertlogo};
|
||||
|
||||
% show progress bar
|
||||
\ifnum \value{showProgressBar}>0\relax%
|
||||
% progress bar icon in the middle of the screen
|
||||
\draw[fill=ProgBarBGColor,draw=none] (0cm,0cm) rectangle(\slidewidth,0.25cm);
|
||||
\draw[fill=PrimaryColor,draw=none] (0cm,0cm) rectangle(\progressbar@tmpdim,0.25cm);
|
||||
|
||||
% bottom info
|
||||
\node[anchor=south west] at(0cm,0.25cm) {\color{PrimaryColor}\tiny\vphantom{lp}\insertsection};
|
||||
% if slide numbers are active
|
||||
\ifnum \value{showSlideNumbers}>0\relax%
|
||||
% if slide totals are active
|
||||
\ifnum \value{showSlideTotal}>0\relax%
|
||||
% draw both slide number and slide total
|
||||
\node[anchor=south east] at(\slidewidth,0.25cm) {\color{PrimaryColor}\tiny\insertframenumber/\inserttotalframenumber};
|
||||
\else
|
||||
\node[anchor=south east] at(\slidewidth,0.25cm) {\color{PrimaryColor}\tiny\insertframenumber};
|
||||
\fi
|
||||
\fi
|
||||
\else
|
||||
% section title in the bottom left
|
||||
\node[anchor=south west] at(0cm,0cm) {\color{PrimaryColor}\tiny\vphantom{lp}\insertsection};
|
||||
% if we're showing slide numbers
|
||||
\ifnum \value{showSlideNumbers}>0\relax%
|
||||
% if slide totals are active
|
||||
\ifnum \value{showSlideTotal}>0\relax%
|
||||
% slide number and slide total
|
||||
\node[anchor=south east] at(\slidewidth,0cm) {\color{PrimaryColor}\tiny\insertframenumber/\inserttotalframenumber};
|
||||
\else
|
||||
\node[anchor=south east] at(\slidewidth,0cm) {\color{PrimaryColor}\tiny\insertframenumber};
|
||||
\fi
|
||||
\fi
|
||||
\fi
|
||||
\fi
|
||||
\end{tikzpicture}
|
||||
}
|
||||
\makeatother
|
||||
|
||||
\AtBeginSection{\frame{\sectionpage}} % section pages
|
||||
\setbeamertemplate{section page}
|
||||
{
|
||||
\begin{tikzpicture}
|
||||
% set up the entire slide as the canvas
|
||||
\useasboundingbox (0,0) rectangle(\slidewidth,\slideheight);
|
||||
\fill[color=BackgroundColor] (-1cm, 2cm) rectangle (\slidewidth, \slideheight+0.1cm);
|
||||
\fill[color=PrimaryColor] (-1cm, 0.5\slideheight-1cm) rectangle(\slidewidth, 0.5\slideheight+1cm);
|
||||
\node[text width=\the\paperwidth-1cm,align=center] at (0.4\slidewidth, 0.5\slideheight) {\color{BackgroundColor}\Huge\textbf{\insertsection}};
|
||||
\end{tikzpicture}
|
||||
}
|
||||
BIN
lectures/2025/lecture-n-1/gu.png
Normal file
|
After Width: | Height: | Size: 81 KiB |
BIN
lectures/2025/lecture-n-1/img/argmining.png
Normal file
|
After Width: | Height: | Size: 57 KiB |
BIN
lectures/2025/lecture-n-1/img/gfast.png
Normal file
|
After Width: | Height: | Size: 64 KiB |
BIN
lectures/2025/lecture-n-1/img/machamp.png
Normal file
|
After Width: | Height: | Size: 337 KiB |
BIN
lectures/2025/lecture-n-1/img/sets.png
Normal file
|
After Width: | Height: | Size: 160 KiB |
1
lectures/2025/lecture-n-1/img/sets.svg
Normal file
|
After Width: | Height: | Size: 51 KiB |
6
lectures/2025/lecture-n-1/img/ud.conllu
Normal file
@@ -0,0 +1,6 @@
|
||||
1 the the DET DT Definite=Def|PronType=Art 3 det _ TokenRange=0:3
|
||||
2 black black ADJ JJ Degree=Pos 3 amod _ TokenRange=4:9
|
||||
3 cat cat NOUN NN Number=Sing 4 nsubj _ TokenRange=10:13
|
||||
4 sees see VERB VBZ Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin 0 root _ TokenRange=14:18
|
||||
5 us we PRON PRP Case=Acc|Number=Plur|Person=1|PronType=Prs 4 obj _ TokenRange=19:21
|
||||
6 now now ADV RB PronType=Dem 4 advmod _ SpaceAfter=No|TokenRange=22:25
|
||||
51
lectures/2025/lecture-n-1/img/ud.svg
Normal file
@@ -0,0 +1,51 @@
|
||||
<svg width="317"
|
||||
height="115"
|
||||
viewBox="0 0 317 115"
|
||||
version="1.1"
|
||||
xmlns="http://www.w3.org/2000/svg">
|
||||
<text x="5" y="108" font-size="16">the</text>
|
||||
<text x="42" y="108" font-size="16">black</text>
|
||||
<text x="97" y="108" font-size="16">cat</text>
|
||||
<text x="143" y="108" font-size="16">sees</text>
|
||||
<text x="189" y="108" font-size="16">us</text>
|
||||
<text x="235" y="108" font-size="16">now</text>
|
||||
<text x="5" y="93" font-size="10">DET</text>
|
||||
<text x="42" y="93" font-size="10">ADJ</text>
|
||||
<text x="97" y="93" font-size="10">NOUN</text>
|
||||
<text x="143" y="93" font-size="10">VERB</text>
|
||||
<text x="189" y="93" font-size="10">PRON</text>
|
||||
<text x="235" y="93" font-size="10">ADV</text>
|
||||
<path d="M 17 80 Q 17 47 50 47 L 72 47 Q 105 47 105 80"
|
||||
stroke="black"
|
||||
fill="none"/>
|
||||
<line x1="17" y1="75" x2="17" y2="80" stroke="black"/>
|
||||
<path d="M 17 80 14 74 20 74"/>
|
||||
<text x="54" y="42" font-size="10">det</text>
|
||||
<path d="M 55 80 Q 55 63 71 63 L 88 63 Q 104 63 104 80"
|
||||
stroke="black"
|
||||
fill="none"/>
|
||||
<line x1="55" y1="75" x2="55" y2="80" stroke="black"/>
|
||||
<path d="M 55 80 52 74 58 74"/>
|
||||
<text x="71" y="58" font-size="10">amod</text>
|
||||
<path d="M 110 80 Q 110 63 127 63 L 133 63 Q 150 63 150 80"
|
||||
stroke="black"
|
||||
fill="none"/>
|
||||
<line x1="110" y1="75" x2="110" y2="80" stroke="black"/>
|
||||
<path d="M 110 80 107 74 113 74"/>
|
||||
<text x="119" y="58" font-size="10">nsubj</text>
|
||||
<line x1="158" y1="20" x2="158" y2="80" stroke="black"/>
|
||||
<path d="M 158 80 155 74 161 74"/>
|
||||
<text x="163" y="28" font-size="10">root</text>
|
||||
<path d="M 166 80 Q 166 63 183 63 L 189 63 Q 206 63 206 80"
|
||||
stroke="black"
|
||||
fill="none"/>
|
||||
<line x1="206" y1="75" x2="206" y2="80" stroke="black"/>
|
||||
<path d="M 206 80 203 74 209 74"/>
|
||||
<text x="179" y="58" font-size="10">obj</text>
|
||||
<path d="M 165 80 Q 165 47 198 47 L 220 47 Q 253 47 253 80"
|
||||
stroke="black"
|
||||
fill="none"/>
|
||||
<line x1="253" y1="75" x2="253" y2="80" stroke="black"/>
|
||||
<path d="M 253 80 250 74 256 74"/>
|
||||
<text x="195" y="42" font-size="10">advmod</text>
|
||||
</svg>
|
||||
|
After Width: | Height: | Size: 2.1 KiB |
219
lectures/2025/lecture-n-1/slides.md
Normal file
@@ -0,0 +1,219 @@
|
||||
---
|
||||
title: "Training and evaluating \\newline dependency parsers"
|
||||
subtitle: "(added to the course by popular demand)"
|
||||
author: "Arianna Masciolini"
|
||||
theme: "lucid"
|
||||
logo: "gu.png"
|
||||
date: "VT25"
|
||||
institute: "LT2214 Computational Syntax"
|
||||
---
|
||||
|
||||
## Today's topic
|
||||
\bigskip \bigskip
|
||||

|
||||
|
||||
# Parsing
|
||||
|
||||
## A structured prediction task
|
||||
Sequence $\to$ structure, e.g.
|
||||
|
||||
- natural language sentence $\to$ syntax tree
|
||||
- code $\to$ AST
|
||||
- argumentative essay $\to$ argumentative structure
|
||||
- ...
|
||||
|
||||
## Example (argmining)
|
||||
|
||||
> Språkbanken has better fika than CLASP: every fika, someone bakes. Sure, CLASP has a better coffee machine. On the other hand, there are more important things than coffee. In fact, most people drink tea in the afternoon.
|
||||
|
||||
## Example (argmining)
|
||||

|
||||
|
||||
\footnotesize From "A gentle introduction to argumentation mining" (Lindahl et al., 2022)
|
||||
|
||||
# Syntactic parsing
|
||||
|
||||
## From sentence to tree
|
||||
From chapter 18 of _Speech and Language Processing_, (Jurafsky & Martin, January 2024 draft):
|
||||
|
||||
> Syntactic parsing is the task of assigning a syntactic structure to a sentence
|
||||
|
||||
- the structure is usually a _syntax tree_
|
||||
- two main classes of approaches:
|
||||
- constituency parsing (e.g. GF)
|
||||
- dependency parsing (e.g. UD)
|
||||
|
||||
## Example (GF)
|
||||
```
|
||||
MicroLang> i MicroLangEng.gf
|
||||
linking ... OK
|
||||
|
||||
Languages: MicroLangEng
|
||||
7 msec
|
||||
MicroLang> p "the black cat sees us now"
|
||||
PredVPS (DetCN the_Det (AdjCN (PositA black_A)
|
||||
(UseN cat_N))) (AdvVP (ComplV2 see_V2 (UsePron
|
||||
we_Pron)) now_Adv)
|
||||
```
|
||||
|
||||
## Example (GF)
|
||||
```haskell
|
||||
PredVPS
|
||||
(DetCN
|
||||
the_Det
|
||||
(AdjCN (PositA black_A) (UseN cat_N))
|
||||
)
|
||||
(AdvVP
|
||||
(ComplV2 see_V2 (UsePron we_Pron))
|
||||
now_Adv
|
||||
)
|
||||
```
|
||||
|
||||
## Example (GF)
|
||||

|
||||
|
||||
# Dependency parsing
|
||||
|
||||
## Example (UD)
|
||||

|
||||
|
||||
\small
|
||||
```
|
||||
1 the _ DET _ _ 3 det _ _
|
||||
2 black _ ADJ _ _ 3 amod _ _
|
||||
3 cat _ NOUN _ _ 4 nsubj _ _
|
||||
4 sees _ VERB _ _ 0 root _ _
|
||||
5 us _ PRON _ _ 4 obj _ _
|
||||
6 now _ ADV _ _ 4 advmod _ _
|
||||
```
|
||||
|
||||
## Two paradigms
|
||||
- __graph-based algorithms__: find the optimal tree from the set of all possible candidate solutions (or a subset of it)
|
||||
- __transition-based algorithms__: incrementally build a tree by solving a sequence of classification problems
|
||||
|
||||
## Graph-based approaches
|
||||
$$\hat{t} = \underset{t \in T(s)}{argmax}\, score(s,t)$$
|
||||
|
||||
- $t$: candidate tree
|
||||
- $\hat{t}$: predicted tree
|
||||
- $s$: input sentence
|
||||
- $T(s)$: set of candidate trees for $s$
|
||||
|
||||
## Complexity
|
||||
Depends on:
|
||||
|
||||
- choice of $T$ (upper bound: $n^{n-1}$, where $n$ is the number of words in $s$)
|
||||
- scoring function (in the __arc-factor model__, the score of a tree is the sum of the score of each edge, scored individually by a NN)
|
||||
|
||||
|
||||
In practice: $O(n^3)$ complexity
|
||||
|
||||
## Transition-based approaches
|
||||
- trees are built through a sequence of steps, called _transitions_
|
||||
- training requires:
|
||||
- a gold-standard treebank (as for graph-based approaches)
|
||||
- an _oracle_, i.e. an algorithm that converts each tree into a gold-standard sequence of transitions
|
||||
- much more efficient: $O(n)$
|
||||
|
||||
## Evaluation
|
||||
2 main metrics:
|
||||
|
||||
- __UAS__ (Unlabelled Attachment Score): what fraction of nodes are attached to the correct dependency head?
|
||||
- __LAS__ (Labelled Attachment Score): what fraction of nodes are attached to the correct dependency head _with an arc labelled with the correct relation type_[^1]?
|
||||
|
||||
[^1]: in UD: the `DEPREL` column
|
||||
|
||||
# Specifics of UD parsing
|
||||
|
||||
## Not just parsing per se
|
||||
UD "parsers" typically do a lot more than dependency parsing:
|
||||
|
||||
- sentence segmentation
|
||||
- tokenization
|
||||
- lemmatization (`LEMMA` column)
|
||||
- POS tagging (`UPOS` + `XPOS`)
|
||||
- morphological tagging (`FEATS`)
|
||||
- ...
|
||||
|
||||
Sometimes, some of these tasks are performed __jointly__ to achieve better performance.
|
||||
|
||||
## Evaluation (UD-specific)
|
||||
Some more specific metrics:
|
||||
|
||||
- __CLAS__ (Content-word LAS): LAS limited to content words
|
||||
- __MLAS__ (Morphology-Aware LAS): CLAS that also uses the `FEATS` column
|
||||
- __BLEX__ (Bi-Lexical dependency score): CLAS that also uses the `LEMMA` column
|
||||
|
||||
## Evaluation script output
|
||||
\small
|
||||
```
|
||||
Metric | Precision | Recall | F1 Score | AligndAcc
|
||||
-----------+-----------+-----------+-----------+-----------
|
||||
Tokens | 100.00 | 100.00 | 100.00 |
|
||||
Sentences | 100.00 | 100.00 | 100.00 |
|
||||
Words | 100.00 | 100.00 | 100.00 |
|
||||
UPOS | 98.36 | 98.36 | 98.36 | 98.36
|
||||
XPOS | 100.00 | 100.00 | 100.00 | 100.00
|
||||
UFeats | 100.00 | 100.00 | 100.00 | 100.00
|
||||
AllTags | 98.36 | 98.36 | 98.36 | 98.36
|
||||
Lemmas | 100.00 | 100.00 | 100.00 | 100.00
|
||||
UAS | 92.73 | 92.73 | 92.73 | 92.73
|
||||
LAS | 90.30 | 90.30 | 90.30 | 90.30
|
||||
CLAS | 88.50 | 88.34 | 88.42 | 88.34
|
||||
MLAS | 86.72 | 86.56 | 86.64 | 86.56
|
||||
BLEX | 88.50 | 88.34 | 88.42 | 88.34
|
||||
```
|
||||
|
||||
## Three generations of parsers
|
||||
(all transition-based)
|
||||
|
||||
1. __MaltParser__ (Nivre et al. 2006): "classic" transition-based parser, data-driven but not NN-based
|
||||
2. __UDPipe__: neural parser, personal favorite
|
||||
- v1 (Straka et al. 2016): fast, solid software, easy to install and available anywhere
|
||||
- v2 (Straka et al. 2018): much better results but slower and only available through an API/via the web GUI
|
||||
3. __MaChAmp__ (van der Goot et al. 2021): transformer-based toolkit for multi-task learning, works on all CoNLL-like data, close to the SOTA, relatively easy to install and train
|
||||
|
||||
## MaChAmp config example
|
||||
```json
|
||||
{"compsyn": {
|
||||
"train_data_path": "PATH-TO-YOUR-TRAIN-SPLIT",
|
||||
"dev_data_path": "PATH-TO-YOUR-DEV-SPLIT",
|
||||
"word_idx": 1,
|
||||
"tasks": {
|
||||
"upos": {
|
||||
"task_type": "seq",
|
||||
"column_idx": 3
|
||||
},
|
||||
"dependency": {
|
||||
"task_type": "dependency",
|
||||
"column_idx": 6}}}}
|
||||
```
|
||||
|
||||
## Your task (lab 3)
|
||||

|
||||
|
||||
1. annotate a small treebank for your language of choice (started yesterday)
|
||||
2. __train a parser-tagger on a reference UD treebank__ (tomorrow, or maybe even today: installation)
|
||||
3. evaluate it on your treebank
|
||||
|
||||
# To learn more
|
||||
|
||||
## Main sources
|
||||
- chapters 18-19 of the January 2024 draft of _Speech and Language Processing_ (Jurafsky & Martin) (full text available [__here__](https://web.stanford.edu/~jurafsky/slp3/))
|
||||
- unit 3-2 of Johansson & Kuhlmann's course "Deep Learning for Natural Language Processing" ([__slides and videos__](https://liu-nlp.ai/dl4nlp/modules/module3/))
|
||||
- section 10.9.2 on parser evaluation from Aarne's course notes (on Canvas)
|
||||
|
||||
## Papers describing the parsers
|
||||
- _MaltParser: A Data-Driven Parser-Generator for Dependency Parsing_ (Nivre et al. 2006) ([__PDF__](http://lrec-conf.org/proceedings/lrec2006/pdf/162_pdf.pdf))
|
||||
- _UDPipe: Trainable Pipeline for Processing CoNLL-U Files Performing Tokenization, Morphological Analysis, POS Tagging and Parsing_ (Straka et al. 2016) ([__PDF__](https://aclanthology.org/L16-1680.pdf))
|
||||
- _UDPipe 2.0 Prototype at CoNLL 2018 UD Shared Task_ (Straka et al. 2018) ([__PDF__](https://aclanthology.org/K18-2020.pdf))
|
||||
- _Massive Choice, Ample Tasks (MACHAMP): A Toolkit for Multi-task Learning in NLP_ (van der Goot et al., 2021) ([__PDF__](https://arxiv.org/pdf/2005.14672))
|
||||
|
||||
## CSE courses you may like
|
||||
1. [DIT231](https://www.gu.se/en/study-gothenburg/programming-language-technology-dit231) Programming language technology
|
||||
- build a complete compiler
|
||||
2. [DIT301](https://www.gu.se/en/study-gothenburg/compiler-construction-dit301) Compiler construction
|
||||
- the hardcore version of 1.
|
||||
- build another compiler _and optimize it_
|
||||
3. DIT247 Machine learning for NLP (?)
|
||||
- has a module on dependency parsing similar to the one in "Deep Learning for Natural Language Processing"
|
||||