github: https://github.com/sheffieldnlp/stance-conditional
Python re module
ipython
FastText: Library for fast text representation and classification.
gensim tutorial (Chinese)
gensim tutorial (English)
gensim
Enable logging
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

Create a corpus
from gensim import corpora, models, similarities

corpus = [[(0, 1.0), (1, 1.0), (2, 1.0)],
          [(2, 1.0), (3, 1.0), (4, 1.0), (5, 1.0), (6, 1.0), (8, 1.0)],
          [(8, 1.0), (10, 1.0), (11, 1.0)]]

Initialize a transformation and index it
tfidf = models.TfidfModel(corpus)
vec = [(0, 1), (4, 1)]
print(tfidf[vec])
# [(0, 0.8075244), (4, 0.5898342)]

index = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features=12)
sims = index[tfidf[vec]]
print(list(enumerate(sims)))
# [(0, 0.4662244), (1, 0.19139354), (2, 0.24600551), (3, 0.82094586), (4, 0.0), (5, 0.0), (6, 0.0), (7, 0.0), (8, 0.0)]

Tokenize
# Remove stopwords and tokenize
# Translator's note: this is only an example; in practice there are other stopwords
# For Chinese text, use the jieba tokenizer: https://github.com/fxsjy/jieba
# (`documents` is the list of nine short example documents from the gensim tutorial)
stoplist = set('for a of the and to in'.split())
texts = [[word for word in document.lower().split() if word not in stoplist]
         for document in documents]

# Remove words that appear only once
from collections import defaultdict
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

texts = [[token for token in text if frequency[token] > 1]
         for text in texts]

from pprint import pprint  # pretty-printer
pprint(texts)
# [['human', 'interface', 'computer'],
#  ['survey', 'user', 'computer', 'system', 'response', 'time'],
#  ['eps', 'user', 'interface', 'system'],
#  ['system', 'human', 'system', 'eps'],
#  ['user', 'response', 'time'],
#  ['trees'],
#  ['graph', 'trees'],
#  ['graph', 'minors', 'trees'],
#  ['graph', 'minors', 'survey']]

Build the dictionary
dictionary = corpora.Dictionary(texts)
dictionary.save('/tmp/deerwester.dict')  # save the dictionary for later use
print(dictionary)
# Dictionary(12 unique tokens)

Produce sparse document vectors
new_doc = "Human computer interaction"
new_vec = dictionary.doc2bow(new_doc.lower().split())
# "interaction" does not appear in the dictionary, so it is ignored
print(new_vec)
# [(0, 1), (1, 1)]

corpus = [dictionary.doc2bow(text) for text in texts]
corpora.MmCorpus.serialize('/tmp/deerwester.mm', corpus)  # store to disk for later use
print(corpus)
# [(0, 1), (1, 1), (2, 1)]
# [(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)]
# [(2, 1), (5, 1), (7, 1), (8, 1)]
# [(1, 1), (5, 2), (8, 1)]
# [(3, 1), (6, 1), (7, 1)]
# [(9, 1)]
# [(9, 1), (10, 1)]
# [(9, 1), (10, 1), (11, 1)]
# [(4, 1), (10, 1), (11, 1)]

Corpus streaming: one document at a time
class MyCorpus(object):
    def __iter__(self):
        for line in open('mycorpus.txt'):
            # assume there's one document per line, tokens separated by whitespace
            yield dictionary.doc2bow(line.lower().split())

# The whole corpus is never loaded into memory
corpus_memory_friendly = MyCorpus()
print(corpus_memory_friendly)
# <__main__.MyCorpus object at 0x10d5690>

for vector in corpus_memory_friendly:  # load one vector into memory at a time
    print(vector)
# [(0, 1), (1, 1), (2, 1)]
# [(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)]
# [(2, 1), (5, 1), (7, 1), (8, 1)]
# [(1, 1), (5, 2), (8, 1)]
# [(3, 1), (6, 1), (7, 1)]
# [(9, 1)]
# [(9, 1), (10, 1)]
# [(9, 1), (10, 1), (11, 1)]
# [(4, 1), (10, 1), (11, 1)]

# Collect statistics about all tokens
dictionary = corpora.Dictionary(line.lower().split() for line in open('mycorpus.txt'))
# Collect the ids of stopwords and of words that appear only once
stop_ids = [dictionary.token2id[stopword] for stopword in stoplist
            if stopword in dictionary.token2id]
once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq == 1]
# Remove stopwords and words that appear only once
dictionary.filter_tokens(stop_ids + once_ids)
# Remove the gaps in the id sequence left by the removed words
dictionary.compactify()
print(dictionary)
# Dictionary(12 unique tokens)

Storing corpora
from gensim import corpora

# Create a toy corpus of two documents
corpus = [[(1, 0.5)], []]  # make one document empty, for the heck of it

corpora.MmCorpus.serialize('/tmp/corpus.mm', corpus)
corpora.SvmLightCorpus.serialize('/tmp/corpus.svmlight', corpus)
corpora.BleiCorpus.serialize('/tmp/corpus.lda-c', corpus)
corpora.LowCorpus.serialize('/tmp/corpus.low', corpus)

corpus = corpora.MmCorpus('/tmp/corpus.mm')
print(corpus)
# MmCorpus(2 documents, 2 features, 1 non-zero entries)

# Load the corpus fully into memory:
# calling list() converts any sequence into a plain Python list
print(list(corpus))
# [[(1, 0.5)], []]

# Or use the streaming interface and print one document at a time
for doc in corpus:
    print(doc)
# [(1, 0.5)]
# []

corpora.BleiCorpus.serialize('/tmp/corpus.lda-c', corpus)

Compatibility with NumPy and SciPy
corpus = gensim.matutils.Dense2Corpus(numpy_matrix)
numpy_matrix = gensim.matutils.corpus2dense(corpus, num_terms=number_of_corpus_features)

corpus = gensim.matutils.Sparse2Corpus(scipy_sparse_matrix)
scipy_csc_matrix = gensim.matutils.corpus2csc(corpus)

Vector transformations
from gensim import corpora, models, similarities

dictionary = corpora.Dictionary.load('/tmp/deerwester.dict')
corpus = corpora.MmCorpus('/tmp/deerwester.mm')
print(corpus)
# MmCorpus(9 documents, 12 features, 28 non-zero entries)

# Step 1 -- initialise a model
tfidf = models.TfidfModel(corpus)
# Step 2 -- use the model to transform vectors
doc_bow = [(0, 1), (1, 1)]
print(tfidf[doc_bow])
# [(0, 0.70710678), (1, 0.70710678)]

corpus_tfidf = tfidf[corpus]
for doc in corpus_tfidf:
    print(doc)
# [(0, 0.57735026918962573), (1, 0.57735026918962573), (2, 0.57735026918962573)]
# [(0, 0.44424552527467476), (3, 0.44424552527467476), (4, 0.44424552527467476), (5, 0.32448702061385548), (6, 0.44424552527467476), (7, 0.32448702061385548)]
# [(2, 0.5710059809418182), (5, 0.41707573620227772), (7, 0.41707573620227772), (8, 0.5710059809418182)]
# [(1, 0.49182558987264147), (5, 0.71848116070837686), (8, 0.49182558987264147)]
# [(3, 0.62825804686700459), (6, 0.62825804686700459), (7, 0.45889394536615247)]
# [(9, 1.0)]
# [(9, 0.70710678118654746), (10, 0.70710678118654746)]
# [(9, 0.50804290089167492), (10, 0.50804290089167492), (11, 0.69554641952003704)]
# [(4, 0.62825804686700459), (10, 0.45889394536615247), (11, 0.62825804686700459)]

lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=2)  # initialise an LSI transformation
# Double wrapper over the original corpus: bow -> tfidf -> fold-in-lsi; the transformation can be serialized
corpus_lsi = lsi[corpus_tfidf]
lsi.print_topics(2)
# topic #0(1.594): -0.703*"trees" + -0.538*"graph" + -0.402*"minors" + -0.187*"survey" + -0.061*"system" + -0.060*"response" + -0.060*"time" + -0.058*"user" + -0.049*"computer" + -0.035*"interface"
# topic #1(1.476): -0.460*"system" + -0.373*"user" + -0.332*"eps" + -0.328*"interface" + -0.320*"response" + -0.320*"time" + -0.293*"computer" + -0.280*"human" + -0.171*"survey" + 0.161*"trees"

for doc in corpus_lsi:  # both bow->tfidf and tfidf->lsi transformations are actually executed here, on the fly
    print(doc)
# [(0, -0.066), (1, 0.520)]  # "Human machine interface for lab abc computer applications"
# [(0, -0.197), (1, 0.761)]  # "A survey of user opinion of computer system response time"
# [(0, -0.090), (1, 0.724)]  # "The EPS user interface management system"
# [(0, -0.076), (1, 0.632)]  # "System and human system engineering testing of EPS"
# [(0, -0.102), (1, 0.574)]  # "Relation of user perceived response time to error measurement"
# [(0, -0.703), (1, -0.161)]  # "The generation of random binary unordered trees"
# [(0, -0.877), (1, -0.168)]  # "The intersection graph of paths in trees"
# [(0, -0.910), (1, -0.141)]  # "Graph minors IV Widths of trees and well quasi ordering"
# [(0, -0.617), (1, 0.054)]  # "Graph minors A survey"

lsi.save('/tmp/model.lsi')  # same for tfidf, lda, ...
lsi = models.LsiModel.load('/tmp/model.lsi')

Available transformations
# Term Frequency * Inverse Document Frequency (Tf-Idf)
model = tfidfmodel.TfidfModel(bow_corpus, normalize=True)

# Latent Semantic Indexing (LSI, or sometimes LSA)
model = lsimodel.LsiModel(tfidf_corpus, id2word=dictionary, num_topics=300)
model.add_documents(another_tfidf_corpus)  # now LSI has been trained on tfidf_corpus + another_tfidf_corpus
lsi_vec = model[tfidf_vec]  # converting a new document into the LSI space does not affect the model
...
model.add_documents(more_documents)  # tfidf_corpus + another_tfidf_corpus + more_documents
lsi_vec = model[tfidf_vec]
...

# Random Projections (RP)
model = rpmodel.RpModel(tfidf_corpus, num_topics=500)

# Latent Dirichlet Allocation (LDA)
model = ldamodel.LdaModel(bow_corpus, id2word=dictionary, num_topics=100)

# Hierarchical Dirichlet Process (HDP)
model = hdpmodel.HdpModel(bow_corpus, id2word=dictionary)

m2vmodel.syn0??
Code
- Linux Shell
find . -type f -size +800M
find . -type f -size +800M -print0 | xargs -0 ls -l
find . -type f -size +800M -print0 | xargs -0 du -h
find . -type f -size +800M -print0 | xargs -0 du -h | sort -nr
du -h --max-depth=1
du -h --max-depth=2 | sort -n
du -hm --max-depth=2 | sort -n
du -hm --max-depth=2 | sort -nr | head -12
head [-n number] filename
word2vec
Models based on Hierarchical Softmax: CBOW and Skip-gram
Models based on Negative Sampling (in gensim the choice between the two objectives is made via the hs/negative parameters, as sketched below)
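A minimal gensim sketch showing how the training objective is selected through the sg, hs and negative arguments; the toy sentences are made up for illustration only:

```python
import gensim

# Toy corpus for illustration; real training needs far more data.
sentences = [["the", "quick", "brown", "fox"],
             ["jumps", "over", "the", "lazy", "dog"]]

# CBOW (sg=0) with hierarchical softmax (hs=1, negative=0)
model_hs = gensim.models.Word2Vec(sentences, sg=0, hs=1, negative=0, min_count=1)

# Skip-gram (sg=1) with negative sampling (hs=0, five noise words)
model_neg = gensim.models.Word2Vec(sentences, sg=1, hs=0, negative=5, min_count=1)
```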
Reading the files in a directory line by line
import os
import gensim

class MySentences(object):
    def __init__(self, dirname):
        self.dirname = dirname

    def __iter__(self):
        for fname in os.listdir(self.dirname):
            for line in open(os.path.join(self.dirname, fname)):
                yield line.split()

sentences = MySentences('/some/directory')  # a memory-friendly iterator
model = gensim.models.Word2Vec(sentences)
What Python's yield does
A brief look at Python's yield: simply put, yield turns a function into a generator. A function that contains yield is no longer an ordinary function; the Python interpreter treats it as a generator, so calling fab(5) does not execute the body of fab but returns an iterable object. When a for loop runs over it, each iteration executes the code inside fab until yield b is reached, at which point fab returns one value of the iteration. On the next iteration, execution resumes at the statement right after yield b, with the function's local variables exactly as they were before the interruption, and the function keeps running until it reaches yield again.
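The fab referenced above is the Fibonacci generator from that article; a minimal version for reference:

```python
def fab(max):
    # Generator: local state (n, a, b) is preserved between yields,
    # so each loop iteration resumes right after `yield b`.
    n, a, b = 0, 0, 1
    while n < max:
        yield b
        a, b = b, a + b
        n = n + 1

for value in fab(5):
    print(value)  # prints 1 1 2 3 5
```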
gensim.models.Word2Vec
Official documentation
Parameter description:
class gensim.models.word2vec.Word2Vec(sentences=None, size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None, sample=0.001, seed=1, workers=3, min_alpha=0.0001, sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=<built-in function hash>, iter=5, null_word=0, trim_rule=None, sorted_vocab=1, batch_words=10000)
Parameters:
· sentences: can be a list; for large corpora, consider building it with BrownCorpus, Text8Corpus or LineSentence.
· sg: selects the training algorithm; the default 0 means CBOW, sg=1 means skip-gram.
· size: dimensionality of the feature vectors, default 100. Larger values need more training data but can give better results; typical values range from tens to a few hundred.
· window: maximum distance between the current word and the predicted word within a sentence.
· alpha: the learning rate.
· seed: seed for the random number generator, used when initialising the word vectors.
· min_count: prunes the vocabulary; words occurring fewer than min_count times are discarded. Default is 5.
· max_vocab_size: RAM limit during vocabulary building; if there are more unique words than this, the least frequent ones are pruned. Roughly 1 GB of RAM is needed per 10 million word types. Set to None for no limit.
· sample: threshold for randomly downsampling high-frequency words, default 1e-3, useful range (0, 1e-5).
· workers: number of worker threads used for training.
· hs: if 1, hierarchical softmax is used; if 0 (default), negative sampling is used.
· negative: if > 0, negative sampling is used, and the value specifies how many noise words are drawn.
· cbow_mean: if 0, use the sum of the context word vectors; if 1 (default), use their mean. Only applies when CBOW is used.
· hashfxn: hash function used to initialise the weights; defaults to Python's built-in hash function.
· iter: number of training iterations (epochs), default 5.
· trim_rule: vocabulary trimming rule that specifies which words to keep and which to discard. Can be None (min_count is used) or a callable that accepts (word, count, min_count) and returns utils.RULE_DISCARD, utils.RULE_KEEP or utils.RULE_DEFAULT.
· sorted_vocab: if 1 (default), sort the vocabulary by descending frequency before assigning word indexes.
· batch_words: number of words passed to each worker thread per batch, default 10000.
model.most_similar(word, topn=top):
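For instance, a hedged sketch of training on the MySentences iterator defined earlier and querying the nearest neighbours; the directory path and the query word are placeholders:

```python
import gensim

# MySentences is the directory iterator defined above; '/some/directory' is a placeholder.
sentences = MySentences('/some/directory')
model = gensim.models.Word2Vec(sentences, sg=1, size=100, window=5, min_count=5, workers=3)

# Ten words most similar to "trump" (arbitrary query; the word must be in the vocabulary)
print(model.most_similar('trump', topn=10))
```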
A few special Python constructs
__call__: makes a class instance a callable object
class Person(object):
    def __init__(self, name, gender):
        self.name = name
        self.gender = gender

    def __call__(self, friend):
        print('My name is %s...' % self.name)
        print('My friend is %s...' % friend)
>>> p = Person('Bob', 'male')
>>> p('Tim')
My name is Bob...
My friend is Tim...
[__slots__](http://www.cnblogs.com/superxuezhazha/p/5793458.html): __slots__ is the list of attributes that instances of a class are allowed to have
class Student(object):
    __slots__ = ('name', 'gender', 'score')

    def __init__(self, name, gender, score):
        self.name = name
        self.gender = gender
        self.score = score
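A quick check (attribute values made up) that assignments outside the declared slots are rejected:

```python
s = Student('Bob', 'male', 59)
s.score = 60          # allowed: 'score' is declared in __slots__
try:
    s.age = 20        # 'age' is not in __slots__
except AttributeError as e:
    print(e)          # "'Student' object has no attribute 'age'"
```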
[@property](http://www.cnblogs.com/superxuezhazha/p/5793450.html): a Python decorator that exposes a method as an attribute
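A minimal sketch of the usual @property pattern; the Student/score names simply mirror the example above:

```python
class Student(object):
    def __init__(self, score):
        self._score = score

    @property
    def score(self):            # read access: s.score
        return self._score

    @score.setter
    def score(self, value):     # write access with validation: s.score = 80
        if not 0 <= value <= 100:
            raise ValueError('score must be between 0 and 100')
        self._score = value
```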
Function list
```python
readInputAndEval(tests, outfile, hid, max_epochs, "tanh", drop, "most", str(i), modelt, w2v, acc_thresh=1)
"""
hidden_size = [100]  # [50, 55, 60]
acc_tresh = 1.0
max_epochs = 8
w2v = "big"  # small
modeltype = ["bicond"]
stopwords = ["most"]
dropout = ["true"]
testsetting = ["weaklySup"]
pretrain = ["pre_cont"]
outfile = "../out/results_batch70_2_morehash3_ep7_9-1e-3-" + tests + "_" + modelt + "_w2v" + w2v + "_hidd" + str(hid) + "_drop" + drop + "_" + pre + "_" + str(i) + ".txt"

# i.e. the call above expands to roughly:
readInputAndEval("weaklySup", outfile, 100, 8, "tanh", drop=true, "most", str(i), modelt="bicond", w2v=big, acc_thresh=1)
"""
def readInputAndEval(testSetting, outfile, hidden_size, max_epochs, tanhOrSoftmax, dropout, stopwords="most", testid="test1", modeltype="bicond", word2vecmodel="small", postprocess=True, shortenTargets=False, useAutoTrump=False, useClinton=True, acc_thresh=1.0, pretrain="pre_cont", usePhrases=False):
""" Reading input files, calling the trainer for training the model, evaluate with official script :param outfile: name for output file :param stopwords: how to filter stopwords, see preprocess.filterStopwords() :param postprocess: force against/favor for tweets which contain the target :param shortenTargets: shorten the target text, see preprocess.transform_targets() :param useAutoTrump: use automatically annotated Trump tweets, experimental, not helping at the moment :param useClinton: add the Hillary Clinton dev data to train data :param testSetting: evaluate on Trump """
    test_trainer("weaklySup", w2vmodel, transformed_tweets, transformed_targets, transformed_labels, ids, transformed_tweets_test,
                 transformed_targets_test, transformed_labels_test, ids_test, hidden_size, max_epochs, tanhOrSoftmax, dropout,
                 modeltype, targetInTweet, testid, acc_thresh=acc_thresh, pretrain=pretrain)
    # transformed_tweets: 19-dimensional
    # transformed_targets: 19-dimensional, "Donald Trump"
    # transformed_labels: 3-dimensional one-hot
    # ids: 0-18759
    # transformed_tweets_test: 19-dimensional, 707 instances
    # transformed_targets_test: 19-dimensional, "Donald Trump", 707 instances
    # transformed_labels_test: 3-dimensional one-hot
    # ids_test: 20000-20707
    # hidden_size: 100
    # max_epochs = 8
    # tanhOrSoftmax: "tanh"
    # dropout = true
    # modeltype = "bicond"
    # targetInTweet: id -> true or false
    # testid = "test1"
    # acc_thresh = 0.9
    # pretrain = "pre_cont"
def test_trainer(testsetting, w2vmodel, tweets, targets, labels, ids, tweets_test, targets_test, labels_test, ids_test, hidden_size, max_epochs, tanhOrSoftmax, dropout, modeltype="conditional", targetInTweet={}, testid = "test-1", pretrain = "pre_cont", acc_thresh=0.9, sep = False):
"""
Method for creating the different models and training them
:param testsetting: "True" for SemEval test setting (Donald Trump), "False" for dev setting (Hillary Clinton)
:param w2vmodel: location of word2vec model
:param tweets: training tweets, read and converted in readInputAndEval()
:param targets: training targets, read and converted in readInputAndEval()
:param labels: training labels, read and converted in readInputAndEval()
:param ids: ids of training instances
:param tweets_test: testing tweets, read and converted in readInputAndEval()
:param targets_test: testing targets, read and converted in readInputAndEval()
:param labels_test: testing labels, read and converted in readInputAndEval()
:param ids_test: ids of testing instances
:param hidden_size: size of hidden layer
:param max_epochs: maximum number of training epochs
:param tanhOrSoftmax: tanh or softmax in projector
:param dropout: use dropout or not
:param modeltype: "concat", "tweetonly", "conditional", "conditional-reverse", "bicond", "conditional-target-feed", "bicond-sepembed"
:param targetInTweet: dictionary produced with id to targetInTweet mappings in readInputAndEval(), used for postprocessing
:param testid: id of test run
:param pretrain: "pre" (use pretrained word embeddings), "pre_cont" (use pretrained word embeddings and continue training them), "random" (random word embeddings initialisations)
:param acc_thresh: experimental, stop training at certain accuracy threshold (between 0 and 1)
:param sep: True for using separate embeddings matrices, false for one (default)
:return:
"""
model, placeholders = get_model_bidirectional_conditioning(batch_size, max_seq_length, input_size, hidden_size, target_size,
vocab_size, pretrain, tanhOrSoftmax, dropout)
# batch_size = 70
# max_seq_length = 19
# input_size = 100
# hidden_size = 100
# target_size = 3
# vocab_size = 47859
# pretrain = "pre_cont"
# tanhOrSoftmax = "tanh"
# dropout = true
def get_model_bidirectional_conditioning(batch_size, max_seq_length, input_size, hidden_size, target_size,
vocab_size, pretrain, tanhOrSoftmax, dropout):
"""
Bidirectional conditioning model
:param pretrain: "pre": use pretrained word embeddings, "pre-cont": use pre-trained embeddings and continue training them, otherwise: random initialisation
"""
Encoder(rnn_cell.BasicLSTMCell, input_size, hidden_size, drop_prob, drop_prob)
def get_model_conditional(batch_size, max_seq_length, input_size, hidden_size, target_size,
vocab_size, pretrain, tanhOrSoftmax, dropout):
"""
Unidirectional conditional encoding model
:param pretrain: "pre": use pretrained word embeddings, "pre-cont": use pre-trained embeddings and continue training them, otherwise: random initialisation
"""
```
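This is not the repository's actual code, but a minimal TensorFlow 1.x-style sketch of the conditional-encoding idea behind get_model_conditional (module paths such as tf.nn.rnn_cell moved around between TF versions): the LSTM that reads the tweet is initialised with the final state of the LSTM that read the target.

```python
import tensorflow as tf  # TF 1.x-style API, for illustration only


def conditional_encoding_sketch(target_embedded, tweet_embedded, hidden_size):
    # Encode the target; keep only its final LSTM state.
    with tf.variable_scope("target_encoder"):
        target_cell = tf.nn.rnn_cell.BasicLSTMCell(hidden_size)
        _, target_state = tf.nn.dynamic_rnn(target_cell, target_embedded, dtype=tf.float32)

    # Encode the tweet, conditioned on the target by using the target's
    # final state as the tweet LSTM's initial state.
    with tf.variable_scope("tweet_encoder"):
        tweet_cell = tf.nn.rnn_cell.BasicLSTMCell(hidden_size)
        outputs, tweet_state = tf.nn.dynamic_rnn(tweet_cell, tweet_embedded,
                                                 initial_state=target_state)
    return outputs, tweet_state
```

Roughly speaking, the "bicond" variant repeats this in both reading directions and combines the resulting final states before the tanh/softmax projection.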