edu.stanford.nlp.ling.MultiTokenTag



API Popularity

1 Client projects

Project: edu.stanford

Package: edu.stanford.nlp

Project stanfordnlp/CoreNLP in file ...anford.nlp.dcoref.RuleBasedCorefMentionFinder.java (2013-08-30)
@@ -8,7 +8,6 @@ import java.util.Set;
 import java.util.regex.Pattern;
 
 import edu.stanford.nlp.ling.CoreAnnotations;
-import edu.stanford.nlp.ling.MultiTokenTag;
 import edu.stanford.nlp.trees.TreeCoreAnnotations;
 import edu.stanford.nlp.ling.CoreLabel;
 import edu.stanford.nlp.ling.Label;
@@ -83,7 +82,6 @@ public class RuleBasedCorefMentionFinder implements CorefMentionFinder {
       Set<IntPair> mentionSpanSet = Generics.newHashSet();
       Set<IntPair> namedEntitySpanSet = Generics.newHashSet();
 
-      extractPremarkedEntityMentions(s, mentions, mentionSpanSet, namedEntitySpanSet);
       extractNamedEntityMentions(s, mentions, mentionSpanSet, namedEntitySpanSet);
       extractNPorPRP(s, mentions, mentionSpanSet, namedEntitySpanSet);
       extractEnumerations(s, mentions, mentionSpanSet, namedEntitySpanSet);
@@ -101,38 +99,6 @@ public class RuleBasedCorefMentionFinder implements CorefMentionFinder {
     }
   }
 
-  protected void extractPremarkedEntityMentions(CoreMap s, List<Mention> mentions, Set<IntPair> mentionSpanSet, Set<IntPair> namedEntitySpanSet) {
-    List<CoreLabel> sent = s.get(CoreAnnotations.TokensAnnotation.class);
-    SemanticGraph dependency = s.get(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class);
-    int beginIndex = -1;
-    for(CoreLabel w : sent) {
-      MultiTokenTag t = w.get(CoreAnnotations.MentionTokenAnnotation.class);
-      if (t != null) {
-        // Part of a mention
-        if (t.isStart()) {
-          // Start of mention
-          beginIndex = w.get(CoreAnnotations.IndexAnnotation.class) - 1;
-        }
-        if (t.isEnd()) {
-          // end of mention
-          int endIndex = w.get(CoreAnnotations.IndexAnnotation.class);
-          if (beginIndex >= 0) {
-            IntPair mSpan = new IntPair(beginIndex, endIndex);
-            int mentionId = assignIds? ++maxID:-1;
-            Mention m = new Mention(mentionId, beginIndex, endIndex, dependency, new ArrayList<CoreLabel>(sent.subList(beginIndex, endIndex)));
-            mentions.add(m);
-            mentionSpanSet.add(mSpan);
-            beginIndex = -1;
-          } else {
-            SieveCoreferenceSystem.logger.warning("Start of marked mention not found in sentence: "
-                    + t + " at tokenIndex=" + (w.get(CoreAnnotations.IndexAnnotation.class)-1)+ " for "
-                    + s.get(CoreAnnotations.TextAnnotation.class));
-          }
-        }
-      }
-    }
-  }
-
   protected void extractNamedEntityMentions(CoreMap s, List<Mention> mentions, Set<IntPair> mentionSpanSet, Set<IntPair> namedEntitySpanSet) {
     List<CoreLabel> sent = s.get(CoreAnnotations.TokensAnnotation.class);
     SemanticGraph dependency = s.get(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class);
@@ -173,21 +139,20 @@ public class RuleBasedCorefMentionFinder implements CorefMentionFinder {
     }
   }
 
-  private static final TregexPattern npOrPrpMentionPattern = TregexPattern.compile("/^(?:NP|PRP)/");
   protected void extractNPorPRP(CoreMap s, List<Mention> mentions, Set<IntPair> mentionSpanSet, Set<IntPair> namedEntitySpanSet) {
     List<CoreLabel> sent = s.get(CoreAnnotations.TokensAnnotation.class);
     Tree tree = s.get(TreeCoreAnnotations.TreeAnnotation.class);
     tree.indexLeaves();
     SemanticGraph dependency = s.get(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class);
 
-    TregexPattern tgrepPattern = npOrPrpMentionPattern;
+    final String mentionPattern = "/^(?:NP|PRP)/";
+    TregexPattern tgrepPattern = TregexPattern.compile(mentionPattern);
     TregexMatcher matcher = tgrepPattern.matcher(tree);
     while (matcher.find()) {
       Tree t = matcher.getMatch();
       List<Tree> mLeaves = t.getLeaves();
       int beginIdx = ((CoreLabel)mLeaves.get(0).label()).get(CoreAnnotations.IndexAnnotation.class)-1;
       int endIdx = ((CoreLabel)mLeaves.get(mLeaves.size()-1).label()).get(CoreAnnotations.IndexAnnotation.class);
-      if (",".equals(sent.get(endIdx-1).word())) { endIdx--; } // try not to have span that ends with ,
       IntPair mSpan = new IntPair(beginIdx, endIdx);
       if(!mentionSpanSet.contains(mSpan) && !insideNE(mSpan, namedEntitySpanSet)) {
         int mentionID = assignIds? ++maxID:-1;
@@ -198,13 +163,13 @@ public class RuleBasedCorefMentionFinder implements CorefMentionFinder {
     }
   }
   /** Extract enumerations (A, B, and C) */
-  private static final TregexPattern enumerationsMentionPattern = TregexPattern.compile("NP < (/^(?:NP|NNP|NML)/=m1 $.. (/^CC|,/ $.. /^(?:NP|NNP|NML)/=m2))");
   protected void extractEnumerations(CoreMap s, List<Mention> mentions, Set<IntPair> mentionSpanSet, Set<IntPair> namedEntitySpanSet){
     List<CoreLabel> sent = s.get(CoreAnnotations.TokensAnnotation.class);
     Tree tree = s.get(TreeCoreAnnotations.TreeAnnotation.class);
     SemanticGraph dependency = s.get(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class);
 
-    TregexPattern tgrepPattern = enumerationsMentionPattern;
+    final String mentionPattern = "NP < (/^(?:NP|NNP|NML)/=m1 $.. (/^CC|,/ $.. /^(?:NP|NNP|NML)/=m2))";
+    TregexPattern tgrepPattern = TregexPattern.compile(mentionPattern);
     TregexMatcher matcher = tgrepPattern.matcher(tree);
     Map<IntPair, Tree> spanToMentionSubTree = Generics.newHashMap();
     while (matcher.find()) {
@@ -257,7 +222,7 @@ public class RuleBasedCorefMentionFinder implements CorefMentionFinder {
                 + ": originalSpan=[" + StringUtils.joinWords(m.originalSpan, " ") + "], head=" + m.headWord);
         SieveCoreferenceSystem.logger.warning("Setting head string to entire mention");
         m.headIndex = m.startIndex;
-        m.headWord = m.originalSpan.size() > 0 ? m.originalSpan.get(0) : sent.get(m.startIndex);
+        m.headWord = m.originalSpan.get(0);
         m.headString = m.originalSpan.toString();
       }
     }
@@ -266,11 +231,9 @@ public class RuleBasedCorefMentionFinder implements CorefMentionFinder {
   protected Tree findSyntacticHead(Mention m, Tree root, List<CoreLabel> tokens) {
     // mention ends with 's
     int endIdx = m.endIndex;
-    if (m.originalSpan.size() > 0) {
-      String lastWord = m.originalSpan.get(m.originalSpan.size()-1).get(CoreAnnotations.TextAnnotation.class);
-        if((lastWord.equals("'s") || lastWord.equals("'"))
-            && m.originalSpan.size() != 1 ) endIdx--;
-    }
+    String lastWord = m.originalSpan.get(m.originalSpan.size()-1).get(CoreAnnotations.TextAnnotation.class);
+    if((lastWord.equals("'s") || lastWord.equals("'"))
+        && m.originalSpan.size() != 1 ) endIdx--;
 
     Tree exactMatch = findTreeWithSpan(root, m.startIndex, endIdx);
     //
@@ -386,7 +349,7 @@ public class RuleBasedCorefMentionFinder implements CorefMentionFinder {
     sent.set(CoreAnnotations.TokensAnnotation.class, tokens);
     sent.set(ParserAnnotations.ConstraintAnnotation.class, constraints);
     Annotation doc = new Annotation("");
-    List<CoreMap> sents = new ArrayList<CoreMap>(1);
+    List<CoreMap> sents = new ArrayList<CoreMap>();
     sents.add(sent);
     doc.set(CoreAnnotations.SentencesAnnotation.class, sents);
     getParser().annotate(doc);
@@ -490,7 +453,7 @@ public class RuleBasedCorefMentionFinder implements CorefMentionFinder {
       if(dict.nonWords.contains(m.headString)) remove.add(m);
 
       // quantRule : not starts with 'any', 'all' etc
-      if(m.originalSpan.size() > 0 && dict.quantifiers.contains(m.originalSpan.get(0).get(CoreAnnotations.TextAnnotation.class).toLowerCase())) remove.add(m);
+      if(dict.quantifiers.contains(m.originalSpan.get(0).get(CoreAnnotations.TextAnnotation.class).toLowerCase())) remove.add(m);
 
       // partitiveRule
       if(partitiveRule(m, sent, dict)) remove.add(m);
@@ -549,60 +512,52 @@ public class RuleBasedCorefMentionFinder implements CorefMentionFinder {
   }
 
   /** Check whether pleonastic 'it'. E.g., It is possible that ... */
-  private static final TregexPattern[] pleonasticPatterns = getPleonasticPatterns();
   private static boolean isPleonastic(Mention m, Tree tree) {
     if ( ! m.spanToString().equalsIgnoreCase("it")) return false;
-    for (TregexPattern p : pleonasticPatterns) {
-      if (checkPleonastic(m, tree, p)) {
-        SieveCoreferenceSystem.logger.fine("RuleBasedCorefMentionFinder: matched pleonastic pattern '" + p + "' for " + tree);
-        return true;
-      }
-    }
-    return false;
-  }
-
-  private static TregexPattern[] getPleonasticPatterns() {
     final String[] patterns = {
-            // cdm 2013: I spent a while on these patterns. I fixed a syntax error in five patterns ($.. split with space), so it now shouldn't exception in checkPleonastic. This gave 0.02% on CoNLL11 dev
-            // I tried some more precise patterns but they didn't help. Indeed, they tended to hurt vs. the higher recall patterns.
+        // cdm 2013: I spent a while on these patterns. I fixed a syntax error in five patterns ($.. split with space), so it now shouldn't exception in checkPleonastic. This gave 0.02% on CoNLL11 dev
+        // I tried some more precise patterns but they didn't help. Indeed, they tended to hurt vs. the higher recall patterns.
 
-            //"NP < (PRP=m1) $.. (VP < ((/^V.*/ < /^(?:is|was|become|became)/) $.. (VP < (VBN $.. /S|SBAR/))))", // overmatches
-            // "@NP < (PRP=m1 < it|IT|It) $.. (@VP < (/^V.*/ < /^(?i:is|was|be|becomes|become|became)$/ $.. (@VP < (VBN < expected|hoped $.. @SBAR))))",  // this one seems more accurate, but ...
-            "@NP < (PRP=m1 < it|IT|It) $.. (@VP < (/^V.*/ < /^(?i:is|was|be|becomes|become|became)$/ $.. (@VP < (VBN $.. @S|SBAR))))",  // in practice, go with this one (best results)
+        //"NP < (PRP=m1) $.. (VP < ((/^V.*/ < /^(?:is|was|become|became)/) $.. (VP < (VBN $.. /S|SBAR/))))", // overmatches
+        // "@NP < (PRP=m1 < it|IT|It) $.. (@VP < (/^V.*/ < /^(?i:is|was|be|becomes|become|became)$/ $.. (@VP < (VBN < expected|hoped $.. @SBAR))))",  // this one seems more accurate, but ...
+        "@NP < (PRP=m1 < it|IT|It) $.. (@VP < (/^V.*/ < /^(?i:is|was|be|becomes|become|became)$/ $.. (@VP < (VBN $.. @S|SBAR))))",  // in practice, go with this one (best results)
 
-            "NP < (PRP=m1) $.. (VP < ((/^V.*/ < /^(?:is|was|become|became)/) $.. (ADJP $.. (/S|SBAR/))))",
-            "NP < (PRP=m1) $.. (VP < ((/^V.*/ < /^(?:is|was|become|became)/) $.. (ADJP < (/S|SBAR/))))",
-            // "@NP < (PRP=m1 < it|IT|It) $.. (@VP < (/^V.*/ < /^(?i:is|was|be|becomes|become|became)$/ $.. (@ADJP < (/^(?:JJ|VB)/ < /^(?i:(?:hard|tough|easi)(?:er|est)?|(?:im|un)?(?:possible|interesting|worthwhile|likely|surprising|certain)|disappointing|pointless|easy|fine|okay)$/) [ < @S|SBAR | $.. (@S|SBAR !< (IN !< for|For|FOR|that|That|THAT)) ] )))", // does worse than above 2 on CoNLL11 dev
+        "NP < (PRP=m1) $.. (VP < ((/^V.*/ < /^(?:is|was|become|became)/) $.. (ADJP $.. (/S|SBAR/))))",
+        "NP < (PRP=m1) $.. (VP < ((/^V.*/ < /^(?:is|was|become|became)/) $.. (ADJP < (/S|SBAR/))))",
+        // "@NP < (PRP=m1 < it|IT|It) $.. (@VP < (/^V.*/ < /^(?i:is|was|be|becomes|become|became)$/ $.. (@ADJP < (/^(?:JJ|VB)/ < /^(?i:(?:hard|tough|easi)(?:er|est)?|(?:im|un)?(?:possible|interesting|worthwhile|likely|surprising|certain)|disappointing|pointless|easy|fine|okay)$/) [ < @S|SBAR | $.. (@S|SBAR !< (IN !< for|For|FOR|that|That|THAT)) ] )))", // does worse than above 2 on CoNLL11 dev
 
-            "NP < (PRP=m1) $.. (VP < ((/^V.*/ < /^(?:is|was|become|became)/) $.. (NP < /S|SBAR/)))",
-            "NP < (PRP=m1) $.. (VP < ((/^V.*/ < /^(?:is|was|become|became)/) $.. (NP $.. ADVP $.. /S|SBAR/)))",
-            // "@NP < (PRP=m1 < it|IT|It) $.. (@VP < (/^V.*/ < /^(?i:is|was|be|becomes|become|became)$/ $.. (@NP $.. @ADVP $.. @SBAR)))", // cleft examples, generalized to not need ADVP; but gave worse CoNLL12 dev numbers....
+        "NP < (PRP=m1) $.. (VP < ((/^V.*/ < /^(?:is|was|become|became)/) $.. (NP < /S|SBAR/)))",
+        "NP < (PRP=m1) $.. (VP < ((/^V.*/ < /^(?:is|was|become|became)/) $.. (NP $.. ADVP $.. /S|SBAR/)))",
+        // "@NP < (PRP=m1 < it|IT|It) $.. (@VP < (/^V.*/ < /^(?i:is|was|be|becomes|become|became)$/ $.. (@NP $.. @ADVP $.. @SBAR)))", // cleft examples, generalized to not need ADVP; but gave worse CoNLL12 dev numbers....
 
-            // these next 5 had buggy space in "$ ..", which I fixed
-            "NP < (PRP=m1) $.. (VP < (MD $.. (VP < ((/^V.*/ < /^(?:be|become)/) $.. (VP < (VBN $.. /S|SBAR/))))))",
+        // these next 5 had buggy space in "$ ..", which I fixed
+        "NP < (PRP=m1) $.. (VP < (MD $.. (VP < ((/^V.*/ < /^(?:be|become)/) $.. (VP < (VBN $.. /S|SBAR/))))))",
 
-            "NP < (PRP=m1) $.. (VP < (MD $.. (VP < ((/^V.*/ < /^(?:be|become)/) $.. (ADJP $.. (/S|SBAR/))))))", // extraposed. OK 1/2 correct; need non-adverbial case
-            "NP < (PRP=m1) $.. (VP < (MD $.. (VP < ((/^V.*/ < /^(?:be|become)/) $.. (ADJP < (/S|SBAR/))))))", // OK: 3/3 good matches on dev; but 3/4 wrong on WSJ
-            // certain can be either but relatively likely pleonastic with it ... be
-            // "@NP < (PRP=m1 < it|IT|It) $.. (@VP < (MD $.. (@VP < ((/^V.*/ < /^(?:be|become)/) $.. (@ADJP < (/^JJ/ < /^(?i:(?:hard|tough|easi)(?:er|est)?|(?:im|un)?(?:possible|interesting|worthwhile|likely|surprising|certain)|disappointing|pointless|easy|fine|okay))$/) [ < @S|SBAR | $.. (@S|SBAR !< (IN !< for|For|FOR|that|That|THAT)) ] )))))", // GOOD REPLACEMENT ; 2nd clause is for extraposed ones
+        "NP < (PRP=m1) $.. (VP < (MD $.. (VP < ((/^V.*/ < /^(?:be|become)/) $.. (ADJP $.. (/S|SBAR/))))))", // extraposed. OK 1/2 correct; need non-adverbial case
+        "NP < (PRP=m1) $.. (VP < (MD $.. (VP < ((/^V.*/ < /^(?:be|become)/) $.. (ADJP < (/S|SBAR/))))))", // OK: 3/3 good matches on dev; but 3/4 wrong on WSJ
+        // certain can be either but relatively likely pleonastic with it ... be
+        // "@NP < (PRP=m1 < it|IT|It) $.. (@VP < (MD $.. (@VP < ((/^V.*/ < /^(?:be|become)/) $.. (@ADJP < (/^JJ/ < /^(?i:(?:hard|tough|easi)(?:er|est)?|(?:im|un)?(?:possible|interesting|worthwhile|likely|surprising|certain)|disappointing|pointless|easy|fine|okay))$/) [ < @S|SBAR | $.. (@S|SBAR !< (IN !< for|For|FOR|that|That|THAT)) ] )))))", // GOOD REPLACEMENT ; 2nd clause is for extraposed ones
 
-            "NP < (PRP=m1) $.. (VP < (MD $.. (VP < ((/^V.*/ < /^(?:be|become)/) $.. (NP < /S|SBAR/)))))",
-            "NP < (PRP=m1) $.. (VP < (MD $.. (VP < ((/^V.*/ < /^(?:be|become)/) $.. (NP $.. ADVP $.. /S|SBAR/)))))",
+        "NP < (PRP=m1) $.. (VP < (MD $.. (VP < ((/^V.*/ < /^(?:be|become)/) $.. (NP < /S|SBAR/)))))",
+        "NP < (PRP=m1) $.. (VP < (MD $.. (VP < ((/^V.*/ < /^(?:be|become)/) $.. (NP $.. ADVP $.. /S|SBAR/)))))",
 
-            "NP < (PRP=m1) $.. (VP < ((/^V.*/ < /^(?:seems|appears|means|follows)/) $.. /S|SBAR/))",
+        "NP < (PRP=m1) $.. (VP < ((/^V.*/ < /^(?:seems|appears|means|follows)/) $.. /S|SBAR/))",
 
-            "NP < (PRP=m1) $.. (VP < ((/^V.*/ < /^(?:turns|turned)/) $.. PRT $.. /S|SBAR/))"
+        "NP < (PRP=m1) $.. (VP < ((/^V.*/ < /^(?:turns|turned)/) $.. PRT $.. /S|SBAR/))"
     };
 
-    TregexPattern[] tgrepPatterns = new TregexPattern[patterns.length];
-    for (int i = 0; i < tgrepPatterns.length; i++) {
-      tgrepPatterns[i] = TregexPattern.compile(patterns[i]);
+    for (String p : patterns) {
+      if (checkPleonastic(m, tree, p)) {
+        // System.err.printf("Found pleonastic: %s%n", tree);
+        return true;
+      }
     }
-    return tgrepPatterns;
+    return false;
   }
 
-  private static boolean checkPleonastic(Mention m, Tree tree, TregexPattern tgrepPattern) {
+  private static boolean checkPleonastic(Mention m, Tree tree, String pattern) {
     try {
+      TregexPattern tgrepPattern = TregexPattern.compile(pattern);
       TregexMatcher matcher = tgrepPattern.matcher(tree);
       while (matcher.find()) {
         Tree np1 = matcher.getNode("m1");
Project stanfordnlp/CoreNLP in file ....stanford.nlp.process.WordToSentenceProcessor.java (2013-08-19)
@@ -8,7 +8,6 @@ import edu.stanford.nlp.io.EncodingPrintWriter;
 import edu.stanford.nlp.ling.Document;
 import edu.stanford.nlp.ling.HasWord;
 import edu.stanford.nlp.ling.CoreAnnotations;
-import edu.stanford.nlp.ling.MultiTokenTag;
 import edu.stanford.nlp.util.CoreMap;
 import edu.stanford.nlp.util.Generics;
 
@@ -171,23 +170,14 @@ public class WordToSentenceProcessor<IN> implements ListProcessor<IN, List<IN>>
       }
 
       boolean forcedEnd = false;
-      boolean inMultiTokenExpr = false;
       if (o instanceof CoreMap) {
-        CoreMap cm = (CoreMap) o;
-        Boolean forcedEndValue = cm.get(CoreAnnotations.ForcedSentenceEndAnnotation.class);
+        Boolean forcedEndValue =
+          ((CoreMap)o).get(CoreAnnotations.ForcedSentenceEndAnnotation.class);
         if (forcedEndValue != null)
           forcedEnd = forcedEndValue;
-        else {
-          MultiTokenTag mt = cm.get(CoreAnnotations.MentionTokenAnnotation.class);
-          if (mt != null && !mt.isEnd()) {
-            // In the middle of a multi token mention, make sure sentence is not ended here
-            inMultiTokenExpr = true;
-          }
-        }
       }
 
-
-        if (DEBUG) {
+      if (DEBUG) {
         EncodingPrintWriter.err.println("Word is " + word, "UTF-8");
       }
       if (sentenceRegionBeginPattern != null && ! insideRegion) {
@@ -206,12 +196,7 @@ public class WordToSentenceProcessor<IN> implements ListProcessor<IN, List<IN>>
         }
       } else {
         boolean newSent = false;
-        if (inMultiTokenExpr) {
-          currentSentence.add(o);
-          if (DEBUG) {
-            System.err.println("  is in multi token expr; added to current");
-          }
-        } else if (matchesSentenceBoundaryToDiscard(word)) {
+        if (matchesSentenceBoundaryToDiscard(word)) {
           newSent = true;
         } else if (sentenceRegionEndPattern != null && sentenceRegionEndPattern.matcher(word).matches()) {
           insideRegion = false;
Project stanfordnlp/CoreNLP in file ...anford.nlp.dcoref.RuleBasedCorefMentionFinder.java (2014-06-10)
@@ -8,6 +8,7 @@ import java.util.Set;
 import java.util.regex.Pattern;
 
 import edu.stanford.nlp.ling.CoreAnnotations;
+import edu.stanford.nlp.ling.MultiTokenTag;
 import edu.stanford.nlp.trees.TreeCoreAnnotations;
 import edu.stanford.nlp.ling.CoreLabel;
 import edu.stanford.nlp.ling.Label;
@@ -82,6 +83,7 @@ public class RuleBasedCorefMentionFinder implements CorefMentionFinder {
       Set<IntPair> mentionSpanSet = Generics.newHashSet();
       Set<IntPair> namedEntitySpanSet = Generics.newHashSet();
 
+      extractPremarkedEntityMentions(s, mentions, mentionSpanSet, namedEntitySpanSet);
       extractNamedEntityMentions(s, mentions, mentionSpanSet, namedEntitySpanSet);
       extractNPorPRP(s, mentions, mentionSpanSet, namedEntitySpanSet);
       extractEnumerations(s, mentions, mentionSpanSet, namedEntitySpanSet);
@@ -99,6 +101,38 @@ public class RuleBasedCorefMentionFinder implements CorefMentionFinder {
     }
   }
 
+  protected void extractPremarkedEntityMentions(CoreMap s, List<Mention> mentions, Set<IntPair> mentionSpanSet, Set<IntPair> namedEntitySpanSet) {
+    List<CoreLabel> sent = s.get(CoreAnnotations.TokensAnnotation.class);
+    SemanticGraph dependency = s.get(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class);
+    int beginIndex = -1;
+    for(CoreLabel w : sent) {
+      MultiTokenTag t = w.get(CoreAnnotations.MentionTokenAnnotation.class);
+      if (t != null) {
+        // Part of a mention
+        if (t.isStart()) {
+          // Start of mention
+          beginIndex = w.get(CoreAnnotations.IndexAnnotation.class) - 1;
+        }
+        if (t.isEnd()) {
+          // end of mention
+          int endIndex = w.get(CoreAnnotations.IndexAnnotation.class);
+          if (beginIndex >= 0) {
+            IntPair mSpan = new IntPair(beginIndex, endIndex);
+            int mentionId = assignIds? ++maxID:-1;
+            Mention m = new Mention(mentionId, beginIndex, endIndex, dependency, new ArrayList<CoreLabel>(sent.subList(beginIndex, endIndex)));
+            mentions.add(m);
+            mentionSpanSet.add(mSpan);
+            beginIndex = -1;
+          } else {
+            SieveCoreferenceSystem.logger.warning("Start of marked mention not found in sentence: "
+                    + t + " at tokenIndex=" + (w.get(CoreAnnotations.IndexAnnotation.class)-1)+ " for "
+                    + s.get(CoreAnnotations.TextAnnotation.class));
+          }
+        }
+      }
+    }
+  }
+
   protected void extractNamedEntityMentions(CoreMap s, List<Mention> mentions, Set<IntPair> mentionSpanSet, Set<IntPair> namedEntitySpanSet) {
     List<CoreLabel> sent = s.get(CoreAnnotations.TokensAnnotation.class);
     SemanticGraph dependency = s.get(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class);
@@ -139,20 +173,21 @@ public class RuleBasedCorefMentionFinder implements CorefMentionFinder {
     }
   }
 
+  private static final TregexPattern npOrPrpMentionPattern = TregexPattern.compile("/^(?:NP|PRP)/");
   protected void extractNPorPRP(CoreMap s, List<Mention> mentions, Set<IntPair> mentionSpanSet, Set<IntPair> namedEntitySpanSet) {
     List<CoreLabel> sent = s.get(CoreAnnotations.TokensAnnotation.class);
     Tree tree = s.get(TreeCoreAnnotations.TreeAnnotation.class);
     tree.indexLeaves();
     SemanticGraph dependency = s.get(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class);
 
-    final String mentionPattern = "/^(?:NP|PRP)/";
-    TregexPattern tgrepPattern = TregexPattern.compile(mentionPattern);
+    TregexPattern tgrepPattern = npOrPrpMentionPattern;
     TregexMatcher matcher = tgrepPattern.matcher(tree);
     while (matcher.find()) {
       Tree t = matcher.getMatch();
       List<Tree> mLeaves = t.getLeaves();
       int beginIdx = ((CoreLabel)mLeaves.get(0).label()).get(CoreAnnotations.IndexAnnotation.class)-1;
       int endIdx = ((CoreLabel)mLeaves.get(mLeaves.size()-1).label()).get(CoreAnnotations.IndexAnnotation.class);
+      if (",".equals(sent.get(endIdx-1).word())) { endIdx--; } // try not to have span that ends with ,
       IntPair mSpan = new IntPair(beginIdx, endIdx);
       if(!mentionSpanSet.contains(mSpan) && !insideNE(mSpan, namedEntitySpanSet)) {
         int mentionID = assignIds? ++maxID:-1;
@@ -163,13 +198,13 @@ public class RuleBasedCorefMentionFinder implements CorefMentionFinder {
     }
   }
   /** Extract enumerations (A, B, and C) */
+  private static final TregexPattern enumerationsMentionPattern = TregexPattern.compile("NP < (/^(?:NP|NNP|NML)/=m1 $.. (/^CC|,/ $.. /^(?:NP|NNP|NML)/=m2))");
   protected void extractEnumerations(CoreMap s, List<Mention> mentions, Set<IntPair> mentionSpanSet, Set<IntPair> namedEntitySpanSet){
     List<CoreLabel> sent = s.get(CoreAnnotations.TokensAnnotation.class);
     Tree tree = s.get(TreeCoreAnnotations.TreeAnnotation.class);
     SemanticGraph dependency = s.get(SemanticGraphCoreAnnotations.CollapsedDependenciesAnnotation.class);
 
-    final String mentionPattern = "NP < (/^(?:NP|NNP|NML)/=m1 $.. (/^CC|,/ $.. /^(?:NP|NNP|NML)/=m2))";
-    TregexPattern tgrepPattern = TregexPattern.compile(mentionPattern);
+    TregexPattern tgrepPattern = enumerationsMentionPattern;
     TregexMatcher matcher = tgrepPattern.matcher(tree);
     Map<IntPair, Tree> spanToMentionSubTree = Generics.newHashMap();
     while (matcher.find()) {
@@ -222,7 +257,7 @@ public class RuleBasedCorefMentionFinder implements CorefMentionFinder {
                 + ": originalSpan=[" + StringUtils.joinWords(m.originalSpan, " ") + "], head=" + m.headWord);
         SieveCoreferenceSystem.logger.warning("Setting head string to entire mention");
         m.headIndex = m.startIndex;
-        m.headWord = m.originalSpan.get(0);
+        m.headWord = m.originalSpan.size() > 0 ? m.originalSpan.get(0) : sent.get(m.startIndex);
         m.headString = m.originalSpan.toString();
       }
     }
@@ -231,9 +266,11 @@ public class RuleBasedCorefMentionFinder implements CorefMentionFinder {
   protected Tree findSyntacticHead(Mention m, Tree root, List<CoreLabel> tokens) {
     // mention ends with 's
     int endIdx = m.endIndex;
-    String lastWord = m.originalSpan.get(m.originalSpan.size()-1).get(CoreAnnotations.TextAnnotation.class);
-    if((lastWord.equals("'s") || lastWord.equals("'"))
-        && m.originalSpan.size() != 1 ) endIdx--;
+    if (m.originalSpan.size() > 0) {
+      String lastWord = m.originalSpan.get(m.originalSpan.size()-1).get(CoreAnnotations.TextAnnotation.class);
+        if((lastWord.equals("'s") || lastWord.equals("'"))
+            && m.originalSpan.size() != 1 ) endIdx--;
+    }
 
     Tree exactMatch = findTreeWithSpan(root, m.startIndex, endIdx);
     //
@@ -349,7 +386,7 @@ public class RuleBasedCorefMentionFinder implements CorefMentionFinder {
     sent.set(CoreAnnotations.TokensAnnotation.class, tokens);
     sent.set(ParserAnnotations.ConstraintAnnotation.class, constraints);
     Annotation doc = new Annotation("");
-    List<CoreMap> sents = new ArrayList<CoreMap>();
+    List<CoreMap> sents = new ArrayList<CoreMap>(1);
     sents.add(sent);
     doc.set(CoreAnnotations.SentencesAnnotation.class, sents);
     getParser().annotate(doc);
@@ -453,7 +490,7 @@ public class RuleBasedCorefMentionFinder implements CorefMentionFinder {
       if(dict.nonWords.contains(m.headString)) remove.add(m);
 
       // quantRule : not starts with 'any', 'all' etc
-      if(dict.quantifiers.contains(m.originalSpan.get(0).get(CoreAnnotations.TextAnnotation.class).toLowerCase())) remove.add(m);
+      if(m.originalSpan.size() > 0 && dict.quantifiers.contains(m.originalSpan.get(0).get(CoreAnnotations.TextAnnotation.class).toLowerCase())) remove.add(m);
 
       // partitiveRule
       if(partitiveRule(m, sent, dict)) remove.add(m);
@@ -512,52 +549,60 @@ public class RuleBasedCorefMentionFinder implements CorefMentionFinder {
   }
 
   /** Check whether pleonastic 'it'. E.g., It is possible that ... */
+  private static final TregexPattern[] pleonasticPatterns = getPleonasticPatterns();
   private static boolean isPleonastic(Mention m, Tree tree) {
     if ( ! m.spanToString().equalsIgnoreCase("it")) return false;
+    for (TregexPattern p : pleonasticPatterns) {
+      if (checkPleonastic(m, tree, p)) {
+        SieveCoreferenceSystem.logger.fine("RuleBasedCorefMentionFinder: matched pleonastic pattern '" + p + "' for " + tree);
+        return true;
+      }
+    }
+    return false;
+  }
+
+  private static TregexPattern[] getPleonasticPatterns() {
     final String[] patterns = {
-        // cdm 2013: I spent a while on these patterns. I fixed a syntax error in five patterns ($.. split with space), so it now shouldn't exception in checkPleonastic. This gave 0.02% on CoNLL11 dev
-        // I tried some more precise patterns but they didn't help. Indeed, they tended to hurt vs. the higher recall patterns.
+            // cdm 2013: I spent a while on these patterns. I fixed a syntax error in five patterns ($.. split with space), so it now shouldn't exception in checkPleonastic. This gave 0.02% on CoNLL11 dev
+            // I tried some more precise patterns but they didn't help. Indeed, they tended to hurt vs. the higher recall patterns.
 
-        //"NP < (PRP=m1) $.. (VP < ((/^V.*/ < /^(?:is|was|become|became)/) $.. (VP < (VBN $.. /S|SBAR/))))", // overmatches
-        // "@NP < (PRP=m1 < it|IT|It) $.. (@VP < (/^V.*/ < /^(?i:is|was|be|becomes|become|became)$/ $.. (@VP < (VBN < expected|hoped $.. @SBAR))))",  // this one seems more accurate, but ...
-        "@NP < (PRP=m1 < it|IT|It) $.. (@VP < (/^V.*/ < /^(?i:is|was|be|becomes|become|became)$/ $.. (@VP < (VBN $.. @S|SBAR))))",  // in practice, go with this one (best results)
+            //"NP < (PRP=m1) $.. (VP < ((/^V.*/ < /^(?:is|was|become|became)/) $.. (VP < (VBN $.. /S|SBAR/))))", // overmatches
+            // "@NP < (PRP=m1 < it|IT|It) $.. (@VP < (/^V.*/ < /^(?i:is|was|be|becomes|become|became)$/ $.. (@VP < (VBN < expected|hoped $.. @SBAR))))",  // this one seems more accurate, but ...
+            "@NP < (PRP=m1 < it|IT|It) $.. (@VP < (/^V.*/ < /^(?i:is|was|be|becomes|become|became)$/ $.. (@VP < (VBN $.. @S|SBAR))))",  // in practice, go with this one (best results)
 
-        "NP < (PRP=m1) $.. (VP < ((/^V.*/ < /^(?:is|was|become|became)/) $.. (ADJP $.. (/S|SBAR/))))",
-        "NP < (PRP=m1) $.. (VP < ((/^V.*/ < /^(?:is|was|become|became)/) $.. (ADJP < (/S|SBAR/))))",
-        // "@NP < (PRP=m1 < it|IT|It) $.. (@VP < (/^V.*/ < /^(?i:is|was|be|becomes|become|became)$/ $.. (@ADJP < (/^(?:JJ|VB)/ < /^(?i:(?:hard|tough|easi)(?:er|est)?|(?:im|un)?(?:possible|interesting|worthwhile|likely|surprising|certain)|disappointing|pointless|easy|fine|okay)$/) [ < @S|SBAR | $.. (@S|SBAR !< (IN !< for|For|FOR|that|That|THAT)) ] )))", // does worse than above 2 on CoNLL11 dev
+            "NP < (PRP=m1) $.. (VP < ((/^V.*/ < /^(?:is|was|become|became)/) $.. (ADJP $.. (/S|SBAR/))))",
+            "NP < (PRP=m1) $.. (VP < ((/^V.*/ < /^(?:is|was|become|became)/) $.. (ADJP < (/S|SBAR/))))",
+            // "@NP < (PRP=m1 < it|IT|It) $.. (@VP < (/^V.*/ < /^(?i:is|was|be|becomes|become|became)$/ $.. (@ADJP < (/^(?:JJ|VB)/ < /^(?i:(?:hard|tough|easi)(?:er|est)?|(?:im|un)?(?:possible|interesting|worthwhile|likely|surprising|certain)|disappointing|pointless|easy|fine|okay)$/) [ < @S|SBAR | $.. (@S|SBAR !< (IN !< for|For|FOR|that|That|THAT)) ] )))", // does worse than above 2 on CoNLL11 dev
 
-        "NP < (PRP=m1) $.. (VP < ((/^V.*/ < /^(?:is|was|become|became)/) $.. (NP < /S|SBAR/)))",
-        "NP < (PRP=m1) $.. (VP < ((/^V.*/ < /^(?:is|was|become|became)/) $.. (NP $.. ADVP $.. /S|SBAR/)))",
-        // "@NP < (PRP=m1 < it|IT|It) $.. (@VP < (/^V.*/ < /^(?i:is|was|be|becomes|become|became)$/ $.. (@NP $.. @ADVP $.. @SBAR)))", // cleft examples, generalized to not need ADVP; but gave worse CoNLL12 dev numbers....
+            "NP < (PRP=m1) $.. (VP < ((/^V.*/ < /^(?:is|was|become|became)/) $.. (NP < /S|SBAR/)))",
+            "NP < (PRP=m1) $.. (VP < ((/^V.*/ < /^(?:is|was|become|became)/) $.. (NP $.. ADVP $.. /S|SBAR/)))",
+            // "@NP < (PRP=m1 < it|IT|It) $.. (@VP < (/^V.*/ < /^(?i:is|was|be|becomes|become|became)$/ $.. (@NP $.. @ADVP $.. @SBAR)))", // cleft examples, generalized to not need ADVP; but gave worse CoNLL12 dev numbers....
 
-        // these next 5 had buggy space in "$ ..", which I fixed
-        "NP < (PRP=m1) $.. (VP < (MD $.. (VP < ((/^V.*/ < /^(?:be|become)/) $.. (VP < (VBN $.. /S|SBAR/))))))",
+            // these next 5 had buggy space in "$ ..", which I fixed
+            "NP < (PRP=m1) $.. (VP < (MD $.. (VP < ((/^V.*/ < /^(?:be|become)/) $.. (VP < (VBN $.. /S|SBAR/))))))",
 
-        "NP < (PRP=m1) $.. (VP < (MD $.. (VP < ((/^V.*/ < /^(?:be|become)/) $.. (ADJP $.. (/S|SBAR/))))))", // extraposed. OK 1/2 correct; need non-adverbial case
-        "NP < (PRP=m1) $.. (VP < (MD $.. (VP < ((/^V.*/ < /^(?:be|become)/) $.. (ADJP < (/S|SBAR/))))))", // OK: 3/3 good matches on dev; but 3/4 wrong on WSJ
-        // certain can be either but relatively likely pleonastic with it ... be
-        // "@NP < (PRP=m1 < it|IT|It) $.. (@VP < (MD $.. (@VP < ((/^V.*/ < /^(?:be|become)/) $.. (@ADJP < (/^JJ/ < /^(?i:(?:hard|tough|easi)(?:er|est)?|(?:im|un)?(?:possible|interesting|worthwhile|likely|surprising|certain)|disappointing|pointless|easy|fine|okay))$/) [ < @S|SBAR | $.. (@S|SBAR !< (IN !< for|For|FOR|that|That|THAT)) ] )))))", // GOOD REPLACEMENT ; 2nd clause is for extraposed ones
+            "NP < (PRP=m1) $.. (VP < (MD $.. (VP < ((/^V.*/ < /^(?:be|become)/) $.. (ADJP $.. (/S|SBAR/))))))", // extraposed. OK 1/2 correct; need non-adverbial case
+            "NP < (PRP=m1) $.. (VP < (MD $.. (VP < ((/^V.*/ < /^(?:be|become)/) $.. (ADJP < (/S|SBAR/))))))", // OK: 3/3 good matches on dev; but 3/4 wrong on WSJ
+            // certain can be either but relatively likely pleonastic with it ... be
+            // "@NP < (PRP=m1 < it|IT|It) $.. (@VP < (MD $.. (@VP < ((/^V.*/ < /^(?:be|become)/) $.. (@ADJP < (/^JJ/ < /^(?i:(?:hard|tough|easi)(?:er|est)?|(?:im|un)?(?:possible|interesting|worthwhile|likely|surprising|certain)|disappointing|pointless|easy|fine|okay))$/) [ < @S|SBAR | $.. (@S|SBAR !< (IN !< for|For|FOR|that|That|THAT)) ] )))))", // GOOD REPLACEMENT ; 2nd clause is for extraposed ones
 
-        "NP < (PRP=m1) $.. (VP < (MD $.. (VP < ((/^V.*/ < /^(?:be|become)/) $.. (NP < /S|SBAR/)))))",
-        "NP < (PRP=m1) $.. (VP < (MD $.. (VP < ((/^V.*/ < /^(?:be|become)/) $.. (NP $.. ADVP $.. /S|SBAR/)))))",
+            "NP < (PRP=m1) $.. (VP < (MD $.. (VP < ((/^V.*/ < /^(?:be|become)/) $.. (NP < /S|SBAR/)))))",
+            "NP < (PRP=m1) $.. (VP < (MD $.. (VP < ((/^V.*/ < /^(?:be|become)/) $.. (NP $.. ADVP $.. /S|SBAR/)))))",
 
-        "NP < (PRP=m1) $.. (VP < ((/^V.*/ < /^(?:seems|appears|means|follows)/) $.. /S|SBAR/))",
+            "NP < (PRP=m1) $.. (VP < ((/^V.*/ < /^(?:seems|appears|means|follows)/) $.. /S|SBAR/))",
 
-        "NP < (PRP=m1) $.. (VP < ((/^V.*/ < /^(?:turns|turned)/) $.. PRT $.. /S|SBAR/))"
+            "NP < (PRP=m1) $.. (VP < ((/^V.*/ < /^(?:turns|turned)/) $.. PRT $.. /S|SBAR/))"
     };
 
-    for (String p : patterns) {
-      if (checkPleonastic(m, tree, p)) {
-        // System.err.printf("Found pleonastic: %s%n", tree);
-        return true;
-      }
+    TregexPattern[] tgrepPatterns = new TregexPattern[patterns.length];
+    for (int i = 0; i < tgrepPatterns.length; i++) {
+      tgrepPatterns[i] = TregexPattern.compile(patterns[i]);
     }
-    return false;
+    return tgrepPatterns;
   }
 
-  private static boolean checkPleonastic(Mention m, Tree tree, String pattern) {
+  private static boolean checkPleonastic(Mention m, Tree tree, TregexPattern tgrepPattern) {
     try {
-      TregexPattern tgrepPattern = TregexPattern.compile(pattern);
       TregexMatcher matcher = tgrepPattern.matcher(tree);
       while (matcher.find()) {
         Tree np1 = matcher.getNode("m1");