问题：

使用Stanford CoreNLP(3.5.2)进行并发处理

缪坚诚

2023-03-14

java.util.ConcurrentModificationException
	at java.util.ArrayList$Itr.checkForComodification(ArrayList.java:901)
	at java.util.ArrayList$Itr.next(ArrayList.java:851)
	at java.util.Collections$UnmodifiableCollection$1.next(Collections.java:1042)
	at edu.stanford.nlp.trees.GrammaticalStructure.analyzeNode(GrammaticalStructure.java:463)
	at edu.stanford.nlp.trees.GrammaticalStructure.analyzeNode(GrammaticalStructure.java:488)
	at edu.stanford.nlp.trees.GrammaticalStructure.analyzeNode(GrammaticalStructure.java:488)
	at edu.stanford.nlp.trees.GrammaticalStructure.analyzeNode(GrammaticalStructure.java:488)
	at edu.stanford.nlp.trees.GrammaticalStructure.analyzeNode(GrammaticalStructure.java:488)
	at edu.stanford.nlp.trees.GrammaticalStructure.analyzeNode(GrammaticalStructure.java:488)
	at edu.stanford.nlp.trees.GrammaticalStructure.analyzeNode(GrammaticalStructure.java:488)
	at edu.stanford.nlp.trees.GrammaticalStructure.analyzeNode(GrammaticalStructure.java:488)
	at edu.stanford.nlp.trees.GrammaticalStructure.analyzeNode(GrammaticalStructure.java:488)
	at edu.stanford.nlp.trees.GrammaticalStructure.analyzeNode(GrammaticalStructure.java:488)
	at edu.stanford.nlp.trees.GrammaticalStructure.analyzeNode(GrammaticalStructure.java:488)
	at edu.stanford.nlp.trees.GrammaticalStructure.<init>(GrammaticalStructure.java:201)
	at edu.stanford.nlp.trees.EnglishGrammaticalStructure.<init>(EnglishGrammaticalStructure.java:89)
	at edu.stanford.nlp.semgraph.SemanticGraphFactory.makeFromTree(SemanticGraphFactory.java:139)
	at edu.stanford.nlp.pipeline.DeterministicCorefAnnotator.annotate(DeterministicCorefAnnotator.java:89)
	at edu.stanford.nlp.pipeline.AnnotationPipeline.annotate(AnnotationPipeline.java:68)
	at edu.stanford.nlp.pipeline.StanfordCoreNLP.annotate(StanfordCoreNLP.java:412)

我附上了一个应用程序的示例代码，它在我的Core i3 370M笔记本电脑上（Win 7 64bit，Java 1.8.0.45 64bit）在大约20秒内重现了这个问题。这个应用程序读取识别文本蕴涵(RTE)语料库的XML文件，然后使用标准Java并发类同时解析所有句子。本地RTE XML文件的路径需要作为命令行参数给出。在我的测试中，我使用了以下公开的XML文件：http://www.nist.gov/tac/data/rte/rte3-dev-final.tar.gz

package semante.parser.stanford.server;

import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.PrintStream;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;

import javax.xml.bind.JAXBContext;
import javax.xml.bind.Unmarshaller;
import javax.xml.bind.annotation.XmlAccessType;
import javax.xml.bind.annotation.XmlAccessorType;
import javax.xml.bind.annotation.XmlAttribute;
import javax.xml.bind.annotation.XmlElement;
import javax.xml.bind.annotation.XmlRootElement;

import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;

/**
 * Reads an entailment corpus from an XML file and annotates every sentence
 * concurrently, creating one {@link StanfordCoreNLP} pipeline per submitted task.
 *
 * <p>Usage: pass the path of the corpus XML file as the single command-line argument.
 */
public class StanfordMultiThreadingTest {

	/** JAXB binding for the {@code <entailment-corpus>} root element. */
	@XmlRootElement(name = "entailment-corpus")
	@XmlAccessorType (XmlAccessType.FIELD)
	public static class Corpus {
		@XmlElement(name = "pair")
		private List<Pair> pairList = new ArrayList<Pair>();

		/** Appends a pair to the corpus. */
		public void addPair(Pair p) {pairList.add(p);}

		/** Returns the live (mutable) list of pairs. */
		public List<Pair> getPairList() {return pairList;}
	}

	/** JAXB binding for one {@code <pair>} element: a text {@code t} and a hypothesis {@code h}. */
	@XmlRootElement(name="pair")
	public static class Pair {

		@XmlAttribute(name = "id")
		String id;

		@XmlAttribute(name = "entailment")
		String entailment;

		@XmlElement(name = "t")
		String t;

		@XmlElement(name = "h")
		String h;

		// No-arg constructor used by JAXB when unmarshalling.
		private Pair() {}

		/**
		 * @param id         numeric pair id, stored as its decimal string
		 * @param entailment stored as {@code "YES"} when true, {@code "NO"} otherwise
		 * @param t          text sentence
		 * @param h          hypothesis sentence
		 */
		public Pair(int id, boolean entailment, String t, String h) {
			this();
			this.id = Integer.toString(id);
			this.entailment = entailment ? "YES" : "NO";
			this.t = t;
			this.h = h;
		}

		public String getId() {return id;}
		public String getEntailment() {return entailment;}
		public String getT() {return t;}
		public String getH() {return h;}
	}

	/** Output stream that discards every byte written to it. */
	static class NullStream extends OutputStream {
		@Override
		public void write(int b) {}
	}

	private Corpus corpus;
	private final Unmarshaller unmarshaller;
	private final ExecutorService executor;

	/**
	 * Creates the JAXB unmarshaller and a fixed thread pool with one thread
	 * per available processor.
	 *
	 * @throws Exception if the JAXB context or unmarshaller cannot be created
	 */
	public StanfordMultiThreadingTest() throws Exception {
		JAXBContext jaxbCtx = JAXBContext.newInstance(Pair.class, Corpus.class);
		unmarshaller = jaxbCtx.createUnmarshaller();
		executor = Executors.newFixedThreadPool(Runtime.getRuntime().availableProcessors());
	}

	/**
	 * Loads the corpus from the given XML file, decoding it as UTF-8.
	 *
	 * @param fileName path of the corpus XML file
	 * @throws Exception if the file cannot be read or unmarshalled
	 */
	public void readXML(String fileName) throws Exception {
		System.out.println("Reading XML - Started");
		// try-with-resources so the file handle is released even when unmarshalling fails
		try (InputStreamReader reader =
				new InputStreamReader(new FileInputStream(fileName), StandardCharsets.UTF_8)) {
			corpus = (Corpus) unmarshaller.unmarshal(reader);
		}
		System.out.println("Reading XML - Ended");
	}

	/**
	 * Annotates the {@code t} and {@code h} sentence of every loaded pair on the
	 * thread pool, printing a progress line after every 20th completed sentence.
	 * The first task that fails prints its stack trace and shuts the pool down.
	 *
	 * <p>{@code System.err} is silenced while the tasks run and is always restored
	 * before this method returns. The executor is shut down by this method, so it
	 * can be called only once per instance.
	 *
	 * @throws IllegalStateException if {@link #readXML(String)} has not been called
	 * @throws Exception if pipeline construction fails or the wait is interrupted
	 */
	public void parseSentences() throws Exception {
		if (corpus == null) {
			throw new IllegalStateException("readXML must be called before parseSentences");
		}
		System.out.println("Parsing - Started");

		// turn pairs into a list of sentences
		List<String> sentences = new ArrayList<String>();
		for (Pair pair : corpus.getPairList()) {
			sentences.add(pair.getT());
			sentences.add(pair.getH());
		}

		// prepare the properties
		final Properties props = new Properties();
		props.setProperty("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref");

		// first run is long since models are loaded
		new StanfordCoreNLP(props);

		// to avoid the CoreNLP initialization prints (e.g. "Adding annotation pos")
		final PrintStream originalErr = System.err;
		System.setErr(new PrintStream(new NullStream()));
		try {
			final int totalCount = sentences.size();
			final AtomicInteger counter = new AtomicInteger(0);

			// use java concurrency to parallelize the parsing
			for (final String sentence : sentences) {
				try {
					executor.execute(() -> {
						try {
							StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
							Annotation annotation = new Annotation(sentence);
							pipeline.annotate(annotation);
							// use the value this thread produced, not a later re-read of the counter
							int done = counter.incrementAndGet();
							if (done % 20 == 0) {
								System.out.println("Done: " + String.format("%.2f", done * 100 / (double) totalCount));
							}
						} catch (Exception e) {
							// write to the saved stream directly instead of swapping the
							// global System.err from several worker threads at once
							e.printStackTrace(originalErr);
							executor.shutdownNow();
						}
					});
				} catch (java.util.concurrent.RejectedExecutionException rejected) {
					// a failed task already shut the pool down; nothing more can be submitted
					break;
				}
			}
			executor.shutdown();

			System.out.println("Waiting for parsing to end.");
			if (!executor.awaitTermination(10, TimeUnit.MINUTES)) {
				System.out.println("Parsing - Timed out before all sentences were processed");
				executor.shutdownNow();
			}
		} finally {
			// always give the caller its real error stream back
			System.setErr(originalErr);
		}

		System.out.println("Parsing - Ended");
	}

	/**
	 * Entry point.
	 *
	 * @param args {@code args[0]} is the path of the corpus XML file
	 */
	public static void main(String[] args) throws Exception {
		if (args.length < 1) {
			System.err.println("Usage: StanfordMultiThreadingTest <path-to-corpus-xml>");
			System.exit(1);
		}
		StanfordMultiThreadingTest smtt = new StanfordMultiThreadingTest();
		smtt.readXML(args[0]);
		smtt.parseSentences();
	}

}

在查找背景信息时，我看到了斯坦福大学的Christopher Manning和Gabor Angeli给出的回答，这些回答表明Stanford CoreNLP的较新版本应该是线程安全的。但是，最近一份针对CoreNLP 3.4.1版本的bug报告描述了一个并发问题。正如标题中提到的，我使用的是3.5.2版本。

我不清楚我面临的问题是由于bug还是由于我使用软件包的方式有问题。如果更有见识的人能对此有所了解，我将不胜感激。我希望示例代码将有助于再现该问题。谢了！

周鸿云

2023-03-14

您是否尝试过使用threads选项？您可以为单个StanfordCoreNLP管道指定多个线程，然后它将并行处理句子。

例如，如果要在8个核上处理句子，请将threads选项设置为8:

// Configure a single pipeline and set its "threads" option to 8.
Properties props = new Properties();
props.setProperty("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref");
props.setProperty("threads", "8");
StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

尽管如此，我认为您的解决方案也应该可以工作，我们会检查是否存在并发错误；但在此期间，使用此选项或许可以解决您的问题。

使用Stanford CoreNLP(3.5.2)进行并发处理

共有1个答案

相关问答

相关文章

相关阅读

相关工具

相关文档