Stance_Detection_with_Conditional_Encoding

github: https://github.com/sheffieldnlp/stance-conditional
Python re module
ipython
FastText: Library for fast text representation and classification.
gensim tutorial (Chinese)
gensim tutorial (English)

gensim

  1. Enable logging

    >>> import logging
    >>> logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
  2. Creating a corpus

    >>> from gensim import corpora, models, similarities
    >>>
    >>> corpus = [[(0, 1.0), (1, 1.0), (2, 1.0)],
    ...           [(2, 1.0), (3, 1.0), (4, 1.0), (5, 1.0), (6, 1.0), (8, 1.0)],
    ...           [(8, 1.0), (10, 1.0), (11, 1.0)]]
  3. Initialising a transformation and building an index

    >>> tfidf = models.TfidfModel(corpus)
    >>> vec = [(0, 1), (4, 1)]
    >>> print(tfidf[vec])
    [(0, 0.8075244), (4, 0.5898342)]
    >>> index = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features=12)
    >>> sims = index[tfidf[vec]]
    >>> print(list(enumerate(sims)))
    [(0, 0.4662244), (1, 0.19139354), (2, 0.24600551), (3, 0.82094586), (4, 0.0), (5, 0.0), (6, 0.0), (7, 0.0), (8, 0.0)]
  4. tokenize

    >>> # the nine example documents of the Deerwester corpus (they are listed again in the LSI output below)
    >>> documents = ["Human machine interface for lab abc computer applications",
    ...              "A survey of user opinion of computer system response time",
    ...              "The EPS user interface management system",
    ...              "System and human system engineering testing of EPS",
    ...              "Relation of user perceived response time to error measurement",
    ...              "The generation of random binary unordered trees",
    ...              "The intersection graph of paths in trees",
    ...              "Graph minors IV Widths of trees and well quasi ordering",
    ...              "Graph minors A survey"]
    >>>
    >>> # remove stopwords and tokenize
    >>> # translator's note: this is only an example; in practice there are more stopwords
    >>> # for Chinese text, use the jieba tokenizer: https://github.com/fxsjy/jieba
    >>> stoplist = set('for a of the and to in'.split())
    >>> texts = [[word for word in document.lower().split() if word not in stoplist]
    ...          for document in documents]
    >>>
    >>> # remove words that appear only once
    >>> from collections import defaultdict
    >>> frequency = defaultdict(int)
    >>> for text in texts:
    ...     for token in text:
    ...         frequency[token] += 1
    >>>
    >>> texts = [[token for token in text if frequency[token] > 1]
    ...          for text in texts]
    >>>
    >>> from pprint import pprint  # pretty-printer
    >>> pprint(texts)
    [['human', 'interface', 'computer'],
    ['survey', 'user', 'computer', 'system', 'response', 'time'],
    ['eps', 'user', 'interface', 'system'],
    ['system', 'human', 'system', 'eps'],
    ['user', 'response', 'time'],
    ['trees'],
    ['graph', 'trees'],
    ['graph', 'minors', 'trees'],
    ['graph', 'minors', 'survey']]
  5. Building the dictionary (token-to-id mapping)

    >>> dictionary = corpora.Dictionary(texts)
    >>> dictionary.save('/tmp/deerwester.dict')  # save the dictionary for later use
    >>> print(dictionary)
    Dictionary(12 unique tokens)
  6. Creating sparse document vectors

    >>> new_doc = "Human computer interaction"
    >>> new_vec = dictionary.doc2bow(new_doc.lower().split())
    >>> print(new_vec)  # "interaction" does not appear in the dictionary and is ignored
    [(0, 1), (1, 1)]
    >>> corpus = [dictionary.doc2bow(text) for text in texts]
    >>> corpora.MmCorpus.serialize('/tmp/deerwester.mm', corpus)  # store to disk, for later use
    >>> print(corpus)
    [(0, 1), (1, 1), (2, 1)]
    [(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)]
    [(2, 1), (5, 1), (7, 1), (8, 1)]
    [(1, 1), (5, 2), (8, 1)]
    [(3, 1), (6, 1), (7, 1)]
    [(9, 1)]
    [(9, 1), (10, 1)]
    [(9, 1), (10, 1), (11, 1)]
    [(4, 1), (10, 1), (11, 1)]
  7. Corpus streaming: one document at a time

    >>> class MyCorpus(object):
    ...     def __iter__(self):
    ...         for line in open('mycorpus.txt'):
    ...             # assume there's one document per line, tokens separated by whitespace
    ...             yield dictionary.doc2bow(line.lower().split())
    >>> corpus_memory_friendly = MyCorpus()  # does not load the whole corpus into memory
    >>> print(corpus_memory_friendly)
    <__main__.MyCorpus object at 0x10d5690>
    >>> for vector in corpus_memory_friendly:  # load one vector into memory at a time
    ...     print(vector)
    [(0, 1), (1, 1), (2, 1)]
    [(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)]
    [(2, 1), (5, 1), (7, 1), (8, 1)]
    [(1, 1), (5, 2), (8, 1)]
    [(3, 1), (6, 1), (7, 1)]
    [(9, 1)]
    [(9, 1), (10, 1)]
    [(9, 1), (10, 1), (11, 1)]
    [(4, 1), (10, 1), (11, 1)]
    >>> # collect statistics about all tokens
    >>> dictionary = corpora.Dictionary(line.lower().split() for line in open('mycorpus.txt'))
    >>> # collect the ids of stopwords and of words that appear only once
    >>> stop_ids = [dictionary.token2id[stopword] for stopword in stoplist
    ...             if stopword in dictionary.token2id]
    >>> once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq == 1]
    >>> dictionary.filter_tokens(stop_ids + once_ids)  # remove stopwords and words that appear only once
    >>> dictionary.compactify()  # remove the gaps in the id sequence left by the removed words
    >>> print(dictionary)
    Dictionary(12 unique tokens)
  8. Saving a corpus

    >>> from gensim import corpora
    >>> # create a toy corpus of two documents
    >>> corpus = [[(1, 0.5)], []]  # make one document empty, just for the heck of it
    >>>
    >>> corpora.MmCorpus.serialize('/tmp/corpus.mm', corpus)
    >>> corpora.SvmLightCorpus.serialize('/tmp/corpus.svmlight', corpus)
    >>> corpora.BleiCorpus.serialize('/tmp/corpus.lda-c', corpus)
    >>> corpora.LowCorpus.serialize('/tmp/corpus.low', corpus)
    >>> corpus = corpora.MmCorpus('/tmp/corpus.mm')
    >>> print(corpus)
    MmCorpus(2 documents, 2 features, 1 non-zero entries)
    >>> # one way of loading the whole corpus into memory
    >>> print(list(corpus))  # calling list() converts any streamed sequence into a plain Python list
    [[(1, 0.5)], []]
    >>> # another way: use the streaming interface and print one document at a time
    >>> for doc in corpus:
    ... print(doc)
    [(1, 0.5)]
    []
    >>> corpora.BleiCorpus.serialize('/tmp/corpus.lda-c', corpus)
  9. Compatibility with NumPy and SciPy

    >>> # numpy_matrix, scipy_sparse_matrix and number_of_corpus_features are placeholders here
    >>> corpus = gensim.matutils.Dense2Corpus(numpy_matrix)
    >>> numpy_matrix = gensim.matutils.corpus2dense(corpus, num_terms=number_of_corpus_features)
    >>> corpus = gensim.matutils.Sparse2Corpus(scipy_sparse_matrix)
    >>> scipy_csc_matrix = gensim.matutils.corpus2csc(corpus)
  10. Transforming vectors

    >>> from gensim import corpora, models, similarities
    >>> dictionary = corpora.Dictionary.load('/tmp/deerwester.dict')
    >>> corpus = corpora.MmCorpus('/tmp/deerwester.mm')
    >>> print(corpus)
    MmCorpus(9 documents, 12 features, 28 non-zero entries)
    >>> tfidf = models.TfidfModel(corpus)  # step 1 -- initialise a model
    >>> doc_bow = [(0, 1), (1, 1)]
    >>> print(tfidf[doc_bow])  # step 2 -- use the model to transform vectors
    [(0, 0.70710678), (1, 0.70710678)]
    >>> corpus_tfidf = tfidf[corpus]
    >>> for doc in corpus_tfidf:
    ... print(doc)
    [(0, 0.57735026918962573), (1, 0.57735026918962573), (2, 0.57735026918962573)]
    [(0, 0.44424552527467476), (3, 0.44424552527467476), (4, 0.44424552527467476), (5, 0.32448702061385548), (6, 0.44424552527467476), (7, 0.32448702061385548)]
    [(2, 0.5710059809418182), (5, 0.41707573620227772), (7, 0.41707573620227772), (8, 0.5710059809418182)]
    [(1, 0.49182558987264147), (5, 0.71848116070837686), (8, 0.49182558987264147)]
    [(3, 0.62825804686700459), (6, 0.62825804686700459), (7, 0.45889394536615247)]
    [(9, 1.0)]
    [(9, 0.70710678118654746), (10, 0.70710678118654746)]
    [(9, 0.50804290089167492), (10, 0.50804290089167492), (11, 0.69554641952003704)]
    [(4, 0.62825804686700459), (10, 0.45889394536615247), (11, 0.62825804686700459)]
    >>> lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=2)  # initialise an LSI transformation
    >>> corpus_lsi = lsi[corpus_tfidf]  # double wrapper over the original corpus: bow -> tfidf -> fold-in-lsi; the transformation can be serialised
    >>> lsi.print_topics(2)
    topic #0(1.594): -0.703*"trees" + -0.538*"graph" + -0.402*"minors" + -0.187*"survey" + -0.061*"system" + -0.060*"response" + -0.060*"time" + -0.058*"user" + -0.049*"computer" + -0.035*"interface"
    topic #1(1.476): -0.460*"system" + -0.373*"user" + -0.332*"eps" + -0.328*"interface" + -0.320*"response" + -0.320*"time" + -0.293*"computer" + -0.280*"human" + -0.171*"survey" + 0.161*"trees"
    >>> for doc in corpus_lsi: # both bow->tfidf and tfidf->lsi transformations are actually executed here, on the fly
    ... print(doc)
    [(0, -0.066), (1, 0.520)] # "Human machine interface for lab abc computer applications"
    [(0, -0.197), (1, 0.761)] # "A survey of user opinion of computer system response time"
    [(0, -0.090), (1, 0.724)] # "The EPS user interface management system"
    [(0, -0.076), (1, 0.632)] # "System and human system engineering testing of EPS"
    [(0, -0.102), (1, 0.574)] # "Relation of user perceived response time to error measurement"
    [(0, -0.703), (1, -0.161)] # "The generation of random binary unordered trees"
    [(0, -0.877), (1, -0.168)] # "The intersection graph of paths in trees"
    [(0, -0.910), (1, -0.141)] # "Graph minors IV Widths of trees and well quasi ordering"
    [(0, -0.617), (1, 0.054)] # "Graph minors A survey"
    >>> lsi.save('/tmp/model.lsi') # same for tfidf, lda, ...
    >>> lsi = models.LsiModel.load('/tmp/model.lsi')
  11. Available transformations

    # Term Frequency * Inverse Document Frequency (TF-IDF)
    >>> model = tfidfmodel.TfidfModel(bow_corpus, normalize=True)
    # Latent Semantic Indexing (LSI, sometimes called LSA)
    >>> model = lsimodel.LsiModel(tfidf_corpus, id2word=dictionary, num_topics=300)
    >>> model.add_documents(another_tfidf_corpus)  # now LSI has been trained on tfidf_corpus + another_tfidf_corpus
    >>> lsi_vec = model[tfidf_vec]  # convert a new document into the LSI space, without affecting the model
    >>> ...
    >>> model.add_documents(more_documents)  # tfidf_corpus + another_tfidf_corpus + more_documents
    >>> lsi_vec = model[tfidf_vec]
    >>> ...
    # Random Projections (RP)
    >>> model = rpmodel.RpModel(tfidf_corpus, num_topics=500)
    # Latent Dirichlet Allocation (LDA)
    >>> model = ldamodel.LdaModel(bow_corpus, id2word=dictionary, num_topics=100)
    # Hierarchical Dirichlet Process (HDP)
    >>> model = hdpmodel.HdpModel(bow_corpus, id2word=dictionary)
  12. w2vmodel.syn0?? (in gensim, syn0 is the matrix of learned input word vectors; see the sketch below)
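
    A hedged sketch of what syn0 holds, assuming a trained gensim Word2Vec model loaded from a hypothetical path:

    >>> import gensim
    >>> w2vmodel = gensim.models.Word2Vec.load('/path/to/w2v_model')  # hypothetical path
    >>> # recent gensim keeps the embedding matrix under w2vmodel.wv.syn0 (a.k.a. wv.vectors);
    >>> # very old versions exposed it directly as w2vmodel.syn0
    >>> emb = w2vmodel.wv.syn0 if hasattr(w2vmodel, 'wv') else w2vmodel.syn0
    >>> print(emb.shape)  # (vocab_size, vector_size) -- one row of floats per vocabulary word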

Code

  1. Linux Shell

Find large files on Linux

find . -type f -size +800M
find . -type f -size +800M  -print0 | xargs -0 ls -l
find . -type f -size +800M  -print0 | xargs -0 du -h
find . -type f -size +800M  -print0 | xargs -0 du -h | sort -nr
du -h --max-depth=1
du -h --max-depth=2 | sort -n
du -hm --max-depth=2 | sort -n
du -hm --max-depth=2 | sort -nr | head -12

View the first lines of a large text file on Linux

head [-n number] filename
  1. word2vec
    Models based on Hierarchical Softmax

    CBOW
    Skip-gram

    Models based on Negative Sampling (a gensim sketch after this list shows how these variants are selected)
  2. Reading the files in a directory line by line

    >>> import os
    >>> import gensim
    >>>
    >>> class MySentences(object):
    ...     def __init__(self, dirname):
    ...         self.dirname = dirname
    ...
    ...     def __iter__(self):
    ...         for fname in os.listdir(self.dirname):
    ...             for line in open(os.path.join(self.dirname, fname)):
    ...                 yield line.split()
    >>>
    >>> sentences = MySentences('/some/directory')  # a memory-friendly iterator
    >>> model = gensim.models.Word2Vec(sentences)
    
  3. What Python's yield does
    A brief look at Python yield

    In short, yield turns a function into a generator. A function that contains yield is no longer an
    ordinary function: the Python interpreter treats it as a generator, so calling fab(5) does not execute
    the body of fab but returns an iterable object. When a for loop runs over it, each iteration executes
    the code inside fab until yield b is reached, at which point fab returns one value; on the next
    iteration, execution resumes at the statement right after yield b, with every local variable exactly as
    it was before the suspension, and the function keeps running until it hits yield again. A minimal
    sketch of fab is given below.
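
    A minimal sketch of the fab generator the paragraph refers to (assumed to be the usual Fibonacci example from the linked article):

    def fab(max_n):
        """Yield the first max_n Fibonacci numbers, one per loop iteration."""
        n, a, b = 0, 0, 1
        while n < max_n:
            yield b          # execution pauses here and resumes on the next iteration
            a, b = b, a + b
            n += 1

    for value in fab(5):     # fab(5) returns a generator; the body only runs inside the loop
        print(value)         # prints 1 1 2 3 5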

  4. gensim.models.Word2Vec
    Official documentation
    Parameter description
    class gensim.models.word2vec.Word2Vec(sentences=None, size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None, sample=0.001, seed=1, workers=3, min_alpha=0.0001, sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=<built-in function hash>, iter=5, null_word=0, trim_rule=None, sorted_vocab=1, batch_words=10000)

    Parameters:

    · sentences: can be a list; for large corpora, consider building it with BrownCorpus, Text8Corpus or LineSentence.
    · sg: selects the training algorithm; the default 0 means CBOW, sg=1 means skip-gram.
    · size: dimensionality of the feature vectors, default 100. Larger sizes need more training data but can give better results; values from a few tens to a few hundred are typical.
    · window: maximum distance between the current word and the predicted word within a sentence.
    · alpha: the learning rate.
    · seed: seed for the random number generator, used when initialising the word vectors.
    · min_count: truncates the vocabulary; words occurring fewer than min_count times are discarded, default 5.
    · max_vocab_size: RAM limit while building the vocabulary. If there are more unique words than this, the least frequent ones are pruned; roughly 1 GB of RAM per 10 million words. Set to None for no limit.
    · sample: threshold for randomly downsampling high-frequency words, default 1e-3, useful range (0, 1e-5).
    · workers: number of worker threads used for training.
    · hs: if 1, hierarchical softmax is used; if 0 (default), negative sampling is used.
    · negative: if > 0, negative sampling is used, and this sets how many noise words are drawn.
    · cbow_mean: if 0, use the sum of the context word vectors; if 1 (default), use their mean. Only applies when CBOW is used.
    · hashfxn: hash function used to initialise the weights; Python's built-in hash function by default.
    · iter: number of training iterations (epochs), default 5.
    · trim_rule: vocabulary trimming rule that specifies which words to keep and which to discard. Can be None (min_count is used) or a callable that returns utils.RULE_DISCARD, utils.RULE_KEEP or utils.RULE_DEFAULT.
    · sorted_vocab: if 1 (default), sort the vocabulary by descending frequency before assigning word indices.
    · batch_words: number of words passed to worker threads per batch, default 10000.

    A small training sketch follows this parameter list.
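
    A minimal training sketch (an assumption, not from the post), for a pre-4.0 gensim where the parameters are still named size and iter; `sentences` is toy data:

    import gensim

    sentences = [["stance", "detection", "with", "conditional", "encoding"],
                 ["gensim", "word2vec", "example", "sentence"]]   # toy corpus, illustration only

    model = gensim.models.Word2Vec(
        sentences,
        size=100,           # dimensionality of the word vectors
        window=5,           # context window
        min_count=1,        # keep every word of this tiny corpus
        sg=1,               # 1 = skip-gram, 0 = CBOW
        hs=0, negative=5,   # negative sampling with 5 noise words
        workers=3,          # training threads
        iter=5)             # training epochs

    print(model.wv['gensim'].shape)   # (100,) -- one vector per vocabulary word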

  5. model.most_similar(word, topn=top):
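
    A hedged usage sketch, assuming `model` is the Word2Vec model trained in item 4 and 'computer' is in its vocabulary:

    >>> top = 5
    >>> for word, score in model.most_similar('computer', topn=top):  # cosine similarity in the embedding space
    ...     print(word, score)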

  6. Some Python special methods

    __call__: makes an instance of a class callable like a function

    class Person(object):
        def __init__(self, name, gender):
            self.name = name
            self.gender = gender
    
        def __call__(self, friend):
            print('My name is %s...' % self.name)
            print('My friend is %s...' % friend)
    
    >>> p = Person('Bob', 'male')
    >>> p('Tim')
    My name is Bob...
    My friend is Tim...

[__slots__](http://www.cnblogs.com/superxuezhazha/p/5793458.html): __slots__ specifies the complete list of attribute names a class allows; instances cannot be given attributes outside this list

    class Student(object):
        __slots__ = ('name', 'gender', 'score')
        def __init__(self, name, gender, score):
            self.name = name
            self.gender = gender
            self.score = score
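
    A quick usage note (hedged sketch): attributes outside the __slots__ list are rejected.

    >>> s = Student('Bob', 'male', 59)
    >>> s.name
    'Bob'
    >>> s.age = 20  # 'age' is not listed in __slots__
    AttributeError: 'Student' object has no attribute 'age'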

[@property](http://www.cnblogs.com/superxuezhazha/p/5793450.html): a Python decorator that lets a method be accessed like an attribute; a minimal sketch follows.
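
    A minimal @property sketch (the Score class here is an illustrative assumption, in the style of the linked article):

    class Score(object):
        def __init__(self, value):
            self._value = value

        @property
        def value(self):            # read access: s.value, no parentheses
            return self._value

        @value.setter
        def value(self, value):     # write access with validation: s.value = 90
            if not 0 <= value <= 100:
                raise ValueError('value must be between 0 and 100')
            self._value = value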
  1. Function list
    ```python
    readInputAndEval(tests, outfile, hid, max_epochs, "tanh", drop, "most", str(i), modelt, w2v, acc_thresh=1)

    """
    hidden_size = [100] #[50, 55, 60]
    #acc_tresh = 1.0
    max_epochs = 8
    w2v = "big" #small
    modeltype = ["bicond"]
    stopwords = ["most"]
    dropout = ["true"]
    testsetting = ["weaklySup"]
    pretrain = ["pre_cont"]
    outfile = "../out/results_batch70_2_morehash3_ep7_9-1e-3-" + tests + "_" + modelt + "_w2v" + w2v + "_hidd" + str(hid) + "_drop" + drop + "_" + pre + "_" + str(i) + ".txt"
     readInputAndEval("weaklySup", outfile, 100, 8, "tanh", drop = true , "most", str(i), modelt = "bicond" , w2v = big, acc_thresh=1)
    """        
    

    def readInputAndEval(testSetting, outfile, hidden_size, max_epochs, tanhOrSoftmax, dropout, stopwords="most", testid="test1", modeltype="bicond", word2vecmodel="small", postprocess=True, shortenTargets=False, useAutoTrump=False, useClinton=True, acc_thresh=1.0, pretrain="pre_cont", usePhrases=False):

    """
    Reading input files, calling the trainer for training the model, evaluate with official script
    :param outfile: name for output file
    :param stopwords: how to filter stopwords, see preprocess.filterStopwords()
    :param postprocess: force against/favor for tweets which contain the target
    :param shortenTargets: shorten the target text, see preprocess.transform_targets()
    :param useAutoTrump: use automatically annotated Trump tweets, experimental, not helping at the moment
    :param useClinton: add the Hillary Clinton dev data to train data
    :param testSetting: evaluate on Trump
    """
    

    test_trainer("weaklySup", w2vmodel, transformed_tweets, transformed_targets, transformed_labels, ids, transformed_tweets_test,
                 transformed_targets_test, transformed_labels_test, ids_test, hidden_size, max_epochs,
                 tanhOrSoftmax, dropout, modeltype, targetInTweet,
                 testid, acc_thresh=acc_thresh, pretrain=pretrain)
    

    # Argument values observed for this run:
    # transformed_tweets        -- length-19 token id sequences
    # transformed_targets       -- length-19 token id sequences for the target "Donald Trump"
    # transformed_labels        -- 3-dimensional one-hot labels
    # ids                       -- 0 to 18759
    # transformed_tweets_test   -- length-19 sequences, 707 examples
    # transformed_targets_test  -- length-19 sequences for "Donald Trump", 707 examples
    # transformed_labels_test   -- 3-dimensional one-hot labels
    # ids_test                  -- 20000 to 20707
    # hidden_size               -- 100
    # max_epochs                -- 8
    # tanhOrSoftmax             -- "tanh"
    # dropout                   -- "true"
    # modeltype                 -- "bicond"
    # targetInTweet             -- dict mapping id -> True/False
    # testid                    -- "test1"
    # acc_thresh                -- 0.9
    # pretrain                  -- "pre_cont"

def test_trainer(testsetting, w2vmodel, tweets, targets, labels, ids, tweets_test, targets_test, labels_test, ids_test, hidden_size, max_epochs, tanhOrSoftmax, dropout, modeltype="conditional", targetInTweet={}, testid = "test-1", pretrain = "pre_cont", acc_thresh=0.9, sep = False):
    """
    Method for creating the different models and training them
    :param testsetting: "True" for SemEval test setting (Donald Trump), "False" for dev setting (Hillary Clinton)
    :param w2vmodel: location of word2vec model
    :param tweets: training tweets, read and converted in readInputAndEval()
    :param targets: training targets, read and converted in readInputAndEval()
    :param labels: training labels, read and converted in readInputAndEval()
    :param ids: ids of training instances
    :param tweets_test: testing tweets, read and converted in readInputAndEval()
    :param targets_test: testing targets, read and converted in readInputAndEval()
    :param labels_test: testing labels, read and converted in readInputAndEval()
    :param ids_test: ids of testing instances
    :param hidden_size: size of hidden layer
    :param max_epochs: maximum number of training epochs
    :param tanhOrSoftmax: tanh or softmax in projector
    :param dropout: use dropout or not
    :param modeltype: "concat", "tweetonly", "conditional", "conditional-reverse", "bicond", "conditional-target-feed", "bicond-sepembed"
    :param targetInTweet: dictionary produced with id to targetInTweet mappings in readInputAndEval(), used for postprocessing
    :param testid: id of test run
    :param pretrain: "pre" (use pretrained word embeddings), "pre_cont" (use pretrained word embeddings and continue training them), "random" (random word embeddings initialisations)
    :param acc_thresh: experimental, stop training at certain accuracy threshold (between 0 and 1)
    :param sep: True for using separate embeddings matrices, false for one (default)
    :return:
"""


model, placeholders = get_model_bidirectional_conditioning(batch_size, max_seq_length, input_size, hidden_size, target_size,
                                                           vocab_size, pretrain, tanhOrSoftmax, dropout)
# Argument values observed for this run:
batch_size = 70
max_seq_length = 19
input_size = 100
hidden_size = 100
target_size = 3
vocab_size = 47859
pretrain = "pre_cont"
tanhOrSoftmax = "tanh"
dropout = "true"

def get_model_bidirectional_conditioning(batch_size, max_seq_length, input_size, hidden_size, target_size,
                                         vocab_size, pretrain, tanhOrSoftmax, dropout):
    """
    Bidirectional conditioning model
    :param pretrain:  "pre": use pretrained word embeddings, "pre-cont": use pre-trained embeddings and continue training them, otherwise: random initialisation
    """

    Encoder(rnn_cell.BasicLSTMCell, input_size, hidden_size, drop_prob, drop_prob)





def get_model_conditional(batch_size, max_seq_length, input_size, hidden_size, target_size,
                          vocab_size, pretrain, tanhOrSoftmax, dropout):
    """
    Unidirectional conditional encoding model
    :param pretrain:  "pre": use pretrained word embeddings, "pre-cont": use pre-trained embeddings and continue training them, otherwise: random initialisation
    """

```
