Created
January 22, 2013 01:33
-
-
Save alextp/4591270 to your computer and use it in GitHub Desktop.
jinho features style
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/** Marks feature `f` as present in `v`: looks up the index that
  * `ClassifierPosFeatureDomain` assigns to `f` and writes 1.0 at that position.
  */
def addFeature(v: SparseIndexedTensor1, f: String): Unit = {
  val featureIndex = ClassifierPosFeatureDomain.index(f)
  v.update(featureIndex, 1.0)
}
/** Adds the feature `prefix + f` to `v`, but only when
  * `w.ambiguityClasses.contains(f)` holds; otherwise `v` is left untouched.
  */
def addLemma(v: SparseIndexedTensor1, w: WordData, f: String, prefix: String): Unit = {
  val isKnown = w.ambiguityClasses.contains(f)
  if (isKnown) {
    addFeature(v, prefix + f)
  }
}
/** Looks up the ambiguity-class entry for the lemma at position `pos` of `sent`.
  *
  * Returns `w.ambiguityClasses(lemma)` when the lemma is present in
  * `w.ambiguityClasses`, and the empty string otherwise.
  *
  * Note: this method used procedure syntax (`def f(...) { ... }`), which makes the
  * result type `Unit` and silently discards the computed value, so callers that
  * concatenate the result into a feature name always got "()". The `=` makes the
  * if/else expression the actual return value.
  */
def getAffinity(sent: SentenceData, w: WordData, pos: Int) = {
  val lemma = sent.get(sent.lemmas, pos)
  if (w.ambiguityClasses.contains(lemma)) w.ambiguityClasses(lemma) else ""
}
/** Builds the lemma feature string for the token `dif` positions away from `pos`.
  *
  * The result always starts with "W<dif>=" (e.g. "W-1=", "W0=", "W2="). The lemma
  * itself is appended only when `w.ambiguityClasses.contains` it; otherwise the
  * bare prefix is returned.
  */
def getLemmaFeature(sent: SentenceData, w: WordData, pos: Int, dif: Int) = {
  val tag = s"W${dif}="
  val candidate = sent.get(sent.lemmas, pos + dif)
  val suffix = if (w.ambiguityClasses.contains(candidate)) candidate else ""
  tag + suffix
}
/** Populates `f` with the features for the token at index `pos` of `sent`.
  *
  * Features are added in this order:
  *  1. unigrams: lemma features for offsets +3..-3, the labels at offsets -3..-1,
  *     and the affinity values for offsets 0..+3;
  *  2. bigram conjunctions of those strings;
  *  3. trigram conjunctions of those strings;
  *  4. orthographic features of the current token (prefix, suffix, shape,
  *     period / digit / hyphen indicators).
  *
  * Fixes relative to the earlier version:
  *  - the "A+3=" unigram is now actually added (previously "A+2=" was added twice
  *    and `ap3` was never used);
  *  - orthographic features are computed from the raw lemma at `pos` instead of
  *    the already-prefixed "W0=..." string, which made PREFX3 always "W0=" and
  *    HasDigit always true (the prefix itself contains '0');
  *  - HasDigit now tests for any digit rather than only the character '0'.
  *
  * @param sent sentence providing `lemmas` and `labels`, read through `sent.get`
  * @param pos  index of the token being featurized
  * @param f    tensor that receives the features (mutated via `addFeature`)
  * @param w    provides `ambiguityClasses`, used to filter lemmas and look up affinities
  */
def addFeatures(sent: SentenceData, pos: Int, f: SparseIndexedTensor1, w: WordData): Unit = {
  // Lemma features in a +/-3 window around the token.
  val wp3 = getLemmaFeature(sent, w, pos, +3)
  val wp2 = getLemmaFeature(sent, w, pos, +2)
  val wp1 = getLemmaFeature(sent, w, pos, +1)
  val wf = getLemmaFeature(sent, w, pos, 0)
  val wm1 = getLemmaFeature(sent, w, pos, -1)
  val wm2 = getLemmaFeature(sent, w, pos, -2)
  val wm3 = getLemmaFeature(sent, w, pos, -3)

  // Labels of the three preceding positions.
  val pm3 = "POS-3=" + sent.get(sent.labels, pos - 3)
  val pm2 = "POS-2=" + sent.get(sent.labels, pos - 2)
  val pm1 = "POS-1=" + sent.get(sent.labels, pos - 1)

  // Affinity values of the current and the three following positions.
  val a0 = "A=" + getAffinity(sent, w, pos)
  val ap1 = "A+1=" + getAffinity(sent, w, pos + 1)
  val ap2 = "A+2=" + getAffinity(sent, w, pos + 2)
  val ap3 = "A+3=" + getAffinity(sent, w, pos + 3)

  // Raw lemma of the current token, used for the orthographic features below.
  // Assumes sent.get(sent.lemmas, _) yields a String -- TODO confirm against SentenceData.
  val form = sent.get(sent.lemmas, pos)

  val unigrams = Seq(
    wp3, wp2, wp1, wf, wm1, wm2, wm3,
    pm3, pm2, pm1,
    a0, ap1, ap2, ap3
  )

  val bigrams = Seq(
    wm2 + wm1,
    wm1 + wf,
    wf + wp1,
    wp1 + wp2,
    wm1 + wp1,
    pm2 + pm1,
    ap1 + ap2,
    pm1 + ap1,
    pm1 + a0,
    a0 + ap1
  )

  val trigrams = Seq(
    wm2 + wm1 + wf,
    wm1 + wf + wp1,
    wf + wp1 + wp2,
    wm2 + wm1 + wp1,
    wm1 + wp1 + wp2,
    pm2 + pm1 + a0,
    pm1 + a0 + ap1,
    pm2 + pm1 + ap1,
    pm1 + ap1 + ap2,
    a0 + ap1 + ap2
  )

  val orthographic = Seq(
    "PREFX3=" + form.take(3),
    "SUFX4=" + form.takeRight(4),
    "Shape=" + strings.stringShape(form, 2), // TODO(apassos): add the remaining jinho features not contained in shape
    "HasPeriod=" + form.contains("."),
    "HasDigit=" + form.exists(_.isDigit),
    "HasHyphen=" + form.contains("-")
  )

  // Insertion order is kept stable: unigrams, bigrams, trigrams, orthographic.
  (unigrams ++ bigrams ++ trigrams ++ orthographic).foreach(addFeature(f, _))
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment