Stanford CoreNLP CRFClassifier 模型加载和序列化

Posted 2022-12-01 一休Q_Q

tags:

篇首语：本文由小常识网(cha138.com)小编为大家整理，主要介绍了 Stanford CoreNLP CRFClassifier 模型加载和序列化的相关知识，希望对你有一定的参考价值。

源代码位置：edu.stanford.nlp.ie.crf.CRFClassifier
模型加载
 
  loadClassifier(String loadPath, Properties props)
  

 /**
 * Loads a classifier from the file, classpath resource, or URL specified by loadPath. If loadPath ends in
 * .gz, uses a GZIPInputStream.
 */
//seg here ,ner here
public void loadClassifier(String loadPath, Properties props) throws ClassCastException, IOException, ClassNotFoundException 
  InputStream is = IOUtils.getInputStreamFromURLOrClasspathOrFileSystem(loadPath);
  Timing t = new Timing();
  loadClassifier(is, props);
  is.close();
  t.done(log, "Loading classifier from " + loadPath);

loadClassifier(ObjectInputStream ois, Properties props)
/**
   * Loads a classifier from the specified InputStream. This version works
   * quietly (unless VERBOSE is true). If props is non-null then any properties
   * it specifies override those in the serialized file. However, only some
   * properties are sensible to change (you shouldn't change how features are
   * defined).
   * <p>
   * <i>Note:</i> This method does not close the ObjectInputStream. (But earlier
   * versions of the code used to, so beware....)
   */
  @Override
  @SuppressWarnings(  "unchecked" )
  // can't have right types in deserialization
  //seg here,ner here
  public void loadClassifier(ObjectInputStream ois, Properties props) throws ClassCastException, IOException,
      ClassNotFoundException 
    Object o = ois.readObject();
    // TODO: when we next break serialization, get rid of this fork and only read the List<Index> (i.e., keep first case)
    if (o instanceof List) 
      labelIndices = (List<Index<CRFLabel>>) o;
     else 
      Index<CRFLabel>[] indexArray = (Index<CRFLabel>[]) o;
      labelIndices = new ArrayList<>(indexArray.length);
      Collections.addAll(labelIndices, indexArray);
    
    classIndex = (Index<String>) ois.readObject();
    featureIndex = (Index<String>) ois.readObject();
    flags = (SeqClassifierFlags) ois.readObject();
    if (flags.useEmbedding) 
      embeddings = (Map<String, double[]>) ois.readObject();
    
    Object featureFactory = ois.readObject();
    if (featureFactory instanceof List) 
      featureFactories = ErasureUtils.uncheckedCast(featureFactories);
//      int i = 0;
//      for (FeatureFactory ff : featureFactories)  // XXXX
//        System.err.println("List FF #" + i + ": " + ((NERFeatureFactory) ff).describeDistsimLexicon()); // XXXX
//        i++;
//      
     else if (featureFactory instanceof FeatureFactory) 
      featureFactories = Generics.newArrayList();
      featureFactories.add((FeatureFactory) featureFactory);
//      System.err.println(((NERFeatureFactory) featureFactory).describeDistsimLexicon()); // XXXX
     else if (featureFactory instanceof Integer) 
      // this is the current format (2014) since writing list didn't work (see note in serializeClassifier).
      int size = (Integer) featureFactory;
      featureFactories = Generics.newArrayList(size);
      for (int i = 0; i < size; ++i) 
        featureFactory = ois.readObject();
        if (!(featureFactory instanceof FeatureFactory)) 
          throw new RuntimeIOException("Should have FeatureFactory but got " + featureFactory.getClass());
        
//        System.err.println("FF #" + i + ": " + ((NERFeatureFactory) featureFactory).describeDistsimLexicon()); // XXXX
        featureFactories.add((FeatureFactory) featureFactory);
      
    

    // log.info("properties passed into CRF's loadClassifier are:" + props);
    if (props != null) 
      flags.setProperties(props, false);
    

    windowSize = ois.readInt();
    weights = (double[][]) ois.readObject();

    // WordShapeClassifier.setKnownLowerCaseWords((Set) ois.readObject());
    Set<String> lcWords = (Set<String>) ois.readObject();
    if (lcWords instanceof MaxSizeConcurrentHashSet) 
      knownLCWords = (MaxSizeConcurrentHashSet<String>) lcWords;
     else 
      knownLCWords = new MaxSizeConcurrentHashSet<>(lcWords);
    

    reinit();

    if (flags.labelDictionaryCutoff > 0) 
      labelDictionary = (LabelDictionary) ois.readObject();
    

    if (VERBOSE) 
      log.info("windowSize=" + windowSize);
      log.info("flags=\\n" + flags);
    
  

模型序列化
/**
 * Serialize the classifier to the given ObjectOutputStream.
 * <br>
 * (Since the classifier is a processor, we don't want to serialize the
 * whole classifier but just the data that represents a classifier model.)
 */
@Override
public void serializeClassifier(ObjectOutputStream oos) 
  try 
    oos.writeObject(labelIndices);
    oos.writeObject(classIndex);
    oos.writeObject(featureIndex);
    oos.writeObject(flags);
    if (flags.useEmbedding) 
      oos.writeObject(embeddings);
    
    // For some reason, writing out the array of FeatureFactory
    // objects doesn't seem to work.  The resulting classifier
    // doesn't have the lexicon (distsim object) correctly saved.  So now custom write the list
    oos.writeObject(featureFactories.size());
    for (FeatureFactory ff : featureFactories) 
      oos.writeObject(ff);
    
    oos.writeInt(windowSize);
    oos.writeObject(weights);
    // oos.writeObject(WordShapeClassifier.getKnownLowerCaseWords());

    oos.writeObject(knownLCWords);
    if (labelDictionary != null) 
      oos.writeObject(labelDictionary);
    
   catch (IOException e) 
    throw new RuntimeIOException(e);

以上是关于 Stanford CoreNLP CRFClassifier 模型加载和序列化的主要内容，如果未能解决你的问题，请参考以下文章