github: https://github.com/sheffieldnlp/stance-conditional
Python re module
ipython
FastText: Library for fast text representation and classification.
gensim tutorial (Chinese)
gensim tutorial (English)
gensim
Enable logging
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

Create a corpus
from gensim import corpora, models, similarities

corpus = [[(0, 1.0), (1, 1.0), (2, 1.0)],
          [(2, 1.0), (3, 1.0), (4, 1.0), (5, 1.0), (6, 1.0), (8, 1.0)],
          [(8, 1.0), (10, 1.0), (11, 1.0)]]

Initialize a transformation and index it
tfidf = models.TfidfModel(corpus)
vec = [(0, 1), (4, 1)]
print(tfidf[vec])
# [(0, 0.8075244), (4, 0.5898342)]

index = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features=12)
sims = index[tfidf[vec]]
print(list(enumerate(sims)))
# [(0, 0.4662244), (1, 0.19139354), (2, 0.24600551), (3, 0.82094586), (4, 0.0), (5, 0.0), (6, 0.0), (7, 0.0), (8, 0.0)]

Tokenize
# Remove stopwords and tokenize
# Translator's note: this is only an example; in practice there are other stopwords
# For Chinese text, use the jieba tokenizer: https://github.com/fxsjy/jieba
# (`documents` is the list of nine short example documents from the gensim tutorial)
stoplist = set('for a of the and to in'.split())
texts = [[word for word in document.lower().split() if word not in stoplist]
         for document in documents]

# Remove words that appear only once
from collections import defaultdict
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

texts = [[token for token in text if frequency[token] > 1]
         for text in texts]

from pprint import pprint  # pretty-printer
pprint(texts)
# [['human', 'interface', 'computer'],
#  ['survey', 'user', 'computer', 'system', 'response', 'time'],
#  ['eps', 'user', 'interface', 'system'],
#  ['system', 'human', 'system', 'eps'],
#  ['user', 'response', 'time'],
#  ['trees'],
#  ['graph', 'trees'],
#  ['graph', 'minors', 'trees'],
#  ['graph', 'minors', 'survey']]

Build the dictionary
dictionary = corpora.Dictionary(texts)
dictionary.save('/tmp/deerwester.dict')  # save the dictionary for later use
print(dictionary)
# Dictionary(12 unique tokens)

Produce sparse document vectors
new_doc = "Human computer interaction"
new_vec = dictionary.doc2bow(new_doc.lower().split())
# "interaction" does not appear in the dictionary, so it is ignored
print(new_vec)
# [(0, 1), (1, 1)]

corpus = [dictionary.doc2bow(text) for text in texts]
corpora.MmCorpus.serialize('/tmp/deerwester.mm', corpus)  # store to disk for later use
print(corpus)
# [(0, 1), (1, 1), (2, 1)]
# [(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)]
# [(2, 1), (5, 1), (7, 1), (8, 1)]
# [(1, 1), (5, 2), (8, 1)]
# [(3, 1), (6, 1), (7, 1)]
# [(9, 1)]
# [(9, 1), (10, 1)]
# [(9, 1), (10, 1), (11, 1)]
# [(4, 1), (10, 1), (11, 1)]

Corpus streaming: one document at a time
class MyCorpus(object):
    def __iter__(self):
        for line in open('mycorpus.txt'):
            # assume there's one document per line, tokens separated by whitespace
            yield dictionary.doc2bow(line.lower().split())

# The whole corpus is never loaded into memory
corpus_memory_friendly = MyCorpus()
print(corpus_memory_friendly)
# <__main__.MyCorpus object at 0x10d5690>

for vector in corpus_memory_friendly:  # load one vector into memory at a time
    print(vector)
# [(0, 1), (1, 1), (2, 1)]
# [(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)]
# [(2, 1), (5, 1), (7, 1), (8, 1)]
# [(1, 1), (5, 2), (8, 1)]
# [(3, 1), (6, 1), (7, 1)]
# [(9, 1)]
# [(9, 1), (10, 1)]
# [(9, 1), (10, 1), (11, 1)]
# [(4, 1), (10, 1), (11, 1)]

# Collect statistics about all tokens
dictionary = corpora.Dictionary(line.lower().split() for line in open('mycorpus.txt'))
# Collect the ids of stopwords and of words that appear only once
stop_ids = [dictionary.token2id[stopword] for stopword in stoplist
            if stopword in dictionary.token2id]
once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq == 1]
# Remove stopwords and words that appear only once
dictionary.filter_tokens(stop_ids + once_ids)
# Remove the gaps in the id sequence left by the removed words
dictionary.compactify()
print(dictionary)
# Dictionary(12 unique tokens)

Storing corpora
from gensim import corpora

# Create a toy corpus of two documents
corpus = [[(1, 0.5)], []]  # make one document empty, for the heck of it

corpora.MmCorpus.serialize('/tmp/corpus.mm', corpus)
corpora.SvmLightCorpus.serialize('/tmp/corpus.svmlight', corpus)
corpora.BleiCorpus.serialize('/tmp/corpus.lda-c', corpus)
corpora.LowCorpus.serialize('/tmp/corpus.low', corpus)

corpus = corpora.MmCorpus('/tmp/corpus.mm')
print(corpus)
# MmCorpus(2 documents, 2 features, 1 non-zero entries)

# Load the corpus fully into memory:
# calling list() converts any sequence into a plain Python list
print(list(corpus))
# [[(1, 0.5)], []]

# Or use the streaming interface and print one document at a time
for doc in corpus:
    print(doc)
# [(1, 0.5)]
# []

corpora.BleiCorpus.serialize('/tmp/corpus.lda-c', corpus)

Compatibility with NumPy and SciPy
corpus = gensim.matutils.Dense2Corpus(numpy_matrix)
numpy_matrix = gensim.matutils.corpus2dense(corpus, num_terms=number_of_corpus_features)

corpus = gensim.matutils.Sparse2Corpus(scipy_sparse_matrix)
scipy_csc_matrix = gensim.matutils.corpus2csc(corpus)

Vector transformations
from gensim import corpora, models, similarities

dictionary = corpora.Dictionary.load('/tmp/deerwester.dict')
corpus = corpora.MmCorpus('/tmp/deerwester.mm')
print(corpus)
# MmCorpus(9 documents, 12 features, 28 non-zero entries)

# Step 1 -- initialise a model
tfidf = models.TfidfModel(corpus)
# Step 2 -- use the model to transform vectors
doc_bow = [(0, 1), (1, 1)]
print(tfidf[doc_bow])
# [(0, 0.70710678), (1, 0.70710678)]

corpus_tfidf = tfidf[corpus]
for doc in corpus_tfidf:
    print(doc)
# [(0, 0.57735026918962573), (1, 0.57735026918962573), (2, 0.57735026918962573)]
# [(0, 0.44424552527467476), (3, 0.44424552527467476), (4, 0.44424552527467476), (5, 0.32448702061385548), (6, 0.44424552527467476), (7, 0.32448702061385548)]
# [(2, 0.5710059809418182), (5, 0.41707573620227772), (7, 0.41707573620227772), (8, 0.5710059809418182)]
# [(1, 0.49182558987264147), (5, 0.71848116070837686), (8, 0.49182558987264147)]
# [(3, 0.62825804686700459), (6, 0.62825804686700459), (7, 0.45889394536615247)]
# [(9, 1.0)]
# [(9, 0.70710678118654746), (10, 0.70710678118654746)]
# [(9, 0.50804290089167492), (10, 0.50804290089167492), (11, 0.69554641952003704)]
# [(4, 0.62825804686700459), (10, 0.45889394536615247), (11, 0.62825804686700459)]

lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=2)  # initialise an LSI transformation
# Double wrapper over the original corpus: bow -> tfidf -> fold-in-lsi; the transformation can be serialized
corpus_lsi = lsi[corpus_tfidf]
lsi.print_topics(2)
# topic #0(1.594): -0.703*"trees" + -0.538*"graph" + -0.402*"minors" + -0.187*"survey" + -0.061*"system" + -0.060*"response" + -0.060*"time" + -0.058*"user" + -0.049*"computer" + -0.035*"interface"
# topic #1(1.476): -0.460*"system" + -0.373*"user" + -0.332*"eps" + -0.328*"interface" + -0.320*"response" + -0.320*"time" + -0.293*"computer" + -0.280*"human" + -0.171*"survey" + 0.161*"trees"

for doc in corpus_lsi:  # both bow->tfidf and tfidf->lsi transformations are actually executed here, on the fly
    print(doc)
# [(0, -0.066), (1, 0.520)]  # "Human machine interface for lab abc computer applications"
# [(0, -0.197), (1, 0.761)]  # "A survey of user opinion of computer system response time"
# [(0, -0.090), (1, 0.724)]  # "The EPS user interface management system"
# [(0, -0.076), (1, 0.632)]  # "System and human system engineering testing of EPS"
# [(0, -0.102), (1, 0.574)]  # "Relation of user perceived response time to error measurement"
# [(0, -0.703), (1, -0.161)]  # "The generation of random binary unordered trees"
# [(0, -0.877), (1, -0.168)]  # "The intersection graph of paths in trees"
# [(0, -0.910), (1, -0.141)]  # "Graph minors IV Widths of trees and well quasi ordering"
# [(0, -0.617), (1, 0.054)]  # "Graph minors A survey"

lsi.save('/tmp/model.lsi')  # same for tfidf, lda, ...
lsi = models.LsiModel.load('/tmp/model.lsi')

Available transformations
# Term Frequency * Inverse Document Frequency (Tf-Idf)
model = tfidfmodel.TfidfModel(bow_corpus, normalize=True)

# Latent Semantic Indexing (LSI, or sometimes LSA)
model = lsimodel.LsiModel(tfidf_corpus, id2word=dictionary, num_topics=300)
model.add_documents(another_tfidf_corpus)  # now LSI has been trained on tfidf_corpus + another_tfidf_corpus
lsi_vec = model[tfidf_vec]  # converting a new document into the LSI space does not affect the model
...
model.add_documents(more_documents)  # tfidf_corpus + another_tfidf_corpus + more_documents
lsi_vec = model[tfidf_vec]
...

# Random Projections (RP)
model = rpmodel.RpModel(tfidf_corpus, num_topics=500)

# Latent Dirichlet Allocation (LDA)
model = ldamodel.LdaModel(bow_corpus, id2word=dictionary, num_topics=100)

# Hierarchical Dirichlet Process (HDP)
model = hdpmodel.HdpModel(bow_corpus, id2word=dictionary)

m2vmodel.syn0??
Code
- Linux Shell
find . -type f -size +800M
find . -type f -size +800M -print0 | xargs -0 ls -l
find . -type f -size +800M -print0 | xargs -0 du -h
find . -type f -size +800M -print0 | xargs -0 du -h | sort -nr
du -h --max-depth=1
du -h --max-depth=2 | sort -n
du -hm --max-depth=2 | sort -n
du -hm --max-depth=2 | sort -nr | head -12
head [-n number] filename
word2vec
Models based on Hierarchical Softmax: CBOW and Skip-gram
Models based on Negative Sampling (in gensim the choice between the two objectives is made via the hs/negative parameters, as sketched below)
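A minimal gensim sketch showing how the training objective is selected through the sg, hs and negative arguments; the toy sentences are made up for illustration only:

```python
import gensim

# Toy corpus for illustration; real training needs far more data.
sentences = [["the", "quick", "brown", "fox"],
             ["jumps", "over", "the", "lazy", "dog"]]

# CBOW (sg=0) with hierarchical softmax (hs=1, negative=0)
model_hs = gensim.models.Word2Vec(sentences, sg=0, hs=1, negative=0, min_count=1)

# Skip-gram (sg=1) with negative sampling (hs=0, five noise words)
model_neg = gensim.models.Word2Vec(sentences, sg=1, hs=0, negative=5, min_count=1)
```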
Reading the files in a directory line by line
import os
import gensim

class MySentences(object):
    def __init__(self, dirname):
        self.dirname = dirname

    def __iter__(self):
        for fname in os.listdir(self.dirname):
            for line in open(os.path.join(self.dirname, fname)):
                yield line.split()

sentences = MySentences('/some/directory')  # a memory-friendly iterator
model = gensim.models.Word2Vec(sentences)
What Python's yield does
A brief look at Python's yield: simply put, yield turns a function into a generator. A function that contains yield is no longer an ordinary function; the Python interpreter treats it as a generator, so calling fab(5) does not execute the body of fab but returns an iterable object. When a for loop runs over it, each iteration executes the code inside fab until yield b is reached, at which point fab returns one value of the iteration. On the next iteration, execution resumes at the statement right after yield b, with the function's local variables exactly as they were before the interruption, and the function keeps running until it reaches yield again.
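The fab referenced above is the Fibonacci generator from that article; a minimal version for reference:

```python
def fab(max):
    # Generator: local state (n, a, b) is preserved between yields,
    # so each loop iteration resumes right after `yield b`.
    n, a, b = 0, 0, 1
    while n < max:
        yield b
        a, b = b, a + b
        n = n + 1

for value in fab(5):
    print(value)  # prints 1 1 2 3 5
```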
gensim.models.Word2Vec
Official documentation
Parameter description:
class gensim.models.word2vec.Word2Vec(sentences=None, size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None, sample=0.001, seed=1, workers=3, min_alpha=0.0001, sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=<built-in function hash>, iter=5, null_word=0, trim_rule=None, sorted_vocab=1, batch_words=10000)
Parameters:
· sentences: can be a list; for large corpora, consider building it with BrownCorpus, Text8Corpus or LineSentence.
· sg: selects the training algorithm; the default 0 means CBOW, sg=1 means skip-gram.
· size: dimensionality of the feature vectors, default 100. Larger values need more training data but can give better results; typical values range from tens to a few hundred.
· window: maximum distance between the current word and the predicted word within a sentence.
· alpha: the learning rate.
· seed: seed for the random number generator, used when initialising the word vectors.
· min_count: prunes the vocabulary; words occurring fewer than min_count times are discarded. Default is 5.
· max_vocab_size: RAM limit during vocabulary building; if there are more unique words than this, the least frequent ones are pruned. Roughly 1 GB of RAM is needed per 10 million word types. Set to None for no limit.
· sample: threshold for randomly downsampling high-frequency words, default 1e-3, useful range (0, 1e-5).
· workers: number of worker threads used for training.
· hs: if 1, hierarchical softmax is used; if 0 (default), negative sampling is used.
· negative: if > 0, negative sampling is used, and the value specifies how many noise words are drawn.
· cbow_mean: if 0, use the sum of the context word vectors; if 1 (default), use their mean. Only applies when CBOW is used.
· hashfxn: hash function used to initialise the weights; defaults to Python's built-in hash function.
· iter: number of training iterations (epochs), default 5.
· trim_rule: vocabulary trimming rule that specifies which words to keep and which to discard. Can be None (min_count is used) or a callable that accepts (word, count, min_count) and returns utils.RULE_DISCARD, utils.RULE_KEEP or utils.RULE_DEFAULT.
· sorted_vocab: if 1 (default), sort the vocabulary by descending frequency before assigning word indexes.
· batch_words: number of words passed to each worker thread per batch, default 10000.
model.most_similar(word, topn=top):
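For instance, a hedged sketch of training on the MySentences iterator defined earlier and querying the nearest neighbours; the directory path and the query word are placeholders:

```python
import gensim

# MySentences is the directory iterator defined above; '/some/directory' is a placeholder.
sentences = MySentences('/some/directory')
model = gensim.models.Word2Vec(sentences, sg=1, size=100, window=5, min_count=5, workers=3)

# Ten words most similar to "trump" (arbitrary query; the word must be in the vocabulary)
print(model.most_similar('trump', topn=10))
```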
A few special Python constructs
__call__: makes a class instance a callable object
class Person(object):
    def __init__(self, name, gender):
        self.name = name
        self.gender = gender

    def __call__(self, friend):
        print('My name is %s...' % self.name)
        print('My friend is %s...' % friend)
>>> p = Person('Bob', 'male')
>>> p('Tim')
My name is Bob...
My friend is Tim...
[__slots__](http://www.cnblogs.com/superxuezhazha/p/5793458.html): __slots__ is the list of attributes that instances of a class are allowed to have
class Student(object):
    __slots__ = ('name', 'gender', 'score')

    def __init__(self, name, gender, score):
        self.name = name
        self.gender = gender
        self.score = score
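A quick check (attribute values made up) that assignments outside the declared slots are rejected:

```python
s = Student('Bob', 'male', 59)
s.score = 60          # allowed: 'score' is declared in __slots__
try:
    s.age = 20        # 'age' is not in __slots__
except AttributeError as e:
    print(e)          # "'Student' object has no attribute 'age'"
```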
[@property](http://www.cnblogs.com/superxuezhazha/p/5793450.html): a Python decorator that exposes a method as an attribute
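A minimal sketch of the usual @property pattern; the Student/score names simply mirror the example above:

```python
class Student(object):
    def __init__(self, score):
        self._score = score

    @property
    def score(self):            # read access: s.score
        return self._score

    @score.setter
    def score(self, value):     # write access with validation: s.score = 80
        if not 0 <= value <= 100:
            raise ValueError('score must be between 0 and 100')
        self._score = value
```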
Function list
```python
readInputAndEval(tests, outfile, hid, max_epochs, "tanh", drop, "most", str(i), modelt, w2v, acc_thresh=1)
"""
hidden_size = [100]  # [50, 55, 60]
acc_tresh = 1.0
max_epochs = 8
w2v = "big"  # small
modeltype = ["bicond"]
stopwords = ["most"]
dropout = ["true"]
testsetting = ["weaklySup"]
pretrain = ["pre_cont"]
outfile = "../out/results_batch70_2_morehash3_ep7_9-1e-3-" + tests + "_" + modelt + "_w2v" + w2v + "_hidd" + str(hid) + "_drop" + drop + "_" + pre + "_" + str(i) + ".txt"

# i.e. the call above expands to roughly:
readInputAndEval("weaklySup", outfile, 100, 8, "tanh", drop=true, "most", str(i), modelt="bicond", w2v=big, acc_thresh=1)
"""
def readInputAndEval(testSetting, outfile, hidden_size, max_epochs, tanhOrSoftmax, dropout, stopwords="most", testid="test1", modeltype="bicond", word2vecmodel="small", postprocess=True, shortenTargets=False, useAutoTrump=False, useClinton=True, acc_thresh=1.0, pretrain="pre_cont", usePhrases=False):
""" Reading input files, calling the trainer for training the model, evaluate with official script :param outfile: name for output file :param stopwords: how to filter stopwords, see preprocess.filterStopwords() :param postprocess: force against/favor for tweets which contain the target :param shortenTargets: shorten the target text, see preprocess.transform_targets() :param useAutoTrump: use automatically annotated Trump tweets, experimental, not helping at the moment :param useClinton: add the Hillary Clinton dev data to train data :param testSetting: evaluate on Trump """
    test_trainer("weaklySup", w2vmodel, transformed_tweets, transformed_targets, transformed_labels, ids, transformed_tweets_test,
                 transformed_targets_test, transformed_labels_test, ids_test, hidden_size, max_epochs, tanhOrSoftmax, dropout,
                 modeltype, targetInTweet, testid, acc_thresh=acc_thresh, pretrain=pretrain)
    # transformed_tweets: 19-dimensional
    # transformed_targets: 19-dimensional, "Donald Trump"
    # transformed_labels: 3-dimensional one-hot
    # ids: 0-18759
    # transformed_tweets_test: 19-dimensional, 707 instances
    # transformed_targets_test: 19-dimensional, "Donald Trump", 707 instances
    # transformed_labels_test: 3-dimensional one-hot
    # ids_test: 20000-20707
    # hidden_size: 100
    # max_epochs = 8
    # tanhOrSoftmax: "tanh"
    # dropout = true
    # modeltype = "bicond"
    # targetInTweet: id -> true or false
    # testid = "test1"
    # acc_thresh = 0.9
    # pretrain = "pre_cont"
def test_trainer(testsetting, w2vmodel, tweets, targets, labels, ids, tweets_test, targets_test, labels_test, ids_test, hidden_size, max_epochs, tanhOrSoftmax, dropout, modeltype="conditional", targetInTweet={}, testid = "test-1", pretrain = "pre_cont", acc_thresh=0.9, sep = False):
"""
Method for creating the different models and training them
:param testsetting: "True" for SemEval test setting (Donald Trump), "False" for dev setting (Hillary Clinton)
:param w2vmodel: location of word2vec model
:param tweets: training tweets, read and converted in readInputAndEval()
:param targets: training targets, read and converted in readInputAndEval()
:param labels: training labels, read and converted in readInputAndEval()
:param ids: ids of training instances
:param tweets_test: testing tweets, read and converted in readInputAndEval()
:param targets_test: testing targets, read and converted in readInputAndEval()
:param labels_test: testing labels, read and converted in readInputAndEval()
:param ids_test: ids of testing instances
:param hidden_size: size of hidden layer
:param max_epochs: maximum number of training epochs
:param tanhOrSoftmax: tanh or softmax in projector
:param dropout: use dropout or not
:param modeltype: "concat", "tweetonly", "conditional", "conditional-reverse", "bicond", "conditional-target-feed", "bicond-sepembed"
:param targetInTweet: dictionary produced with id to targetInTweet mappings in readInputAndEval(), used for postprocessing
:param testid: id of test run
:param pretrain: "pre" (use pretrained word embeddings), "pre_cont" (use pretrained word embeddings and continue training them), "random" (random word embeddings initialisations)
:param acc_thresh: experimental, stop training at certain accuracy threshold (between 0 and 1)
:param sep: True for using separate embeddings matrices, false for one (default)
:return:
"""
model, placeholders = get_model_bidirectional_conditioning(batch_size, max_seq_length, input_size, hidden_size, target_size,
vocab_size, pretrain, tanhOrSoftmax, dropout)
# batch_size = 70
# max_seq_length = 19
# input_size = 100
# hidden_size = 100
# target_size = 3
# vocab_size = 47859
# pretrain = "pre_cont"
# tanhOrSoftmax = "tanh"
# dropout = true
def get_model_bidirectional_conditioning(batch_size, max_seq_length, input_size, hidden_size, target_size,
vocab_size, pretrain, tanhOrSoftmax, dropout):
"""
Bidirectional conditioning model
:param pretrain: "pre": use pretrained word embeddings, "pre-cont": use pre-trained embeddings and continue training them, otherwise: random initialisation
"""
Encoder(rnn_cell.BasicLSTMCell, input_size, hidden_size, drop_prob, drop_prob)
def get_model_conditional(batch_size, max_seq_length, input_size, hidden_size, target_size,
vocab_size, pretrain, tanhOrSoftmax, dropout):
"""
Unidirectional conditional encoding model
:param pretrain: "pre": use pretrained word embeddings, "pre-cont": use pre-trained embeddings and continue training them, otherwise: random initialisation
"""
```
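This is not the repository's actual code, but a minimal TensorFlow 1.x-style sketch of the conditional-encoding idea behind get_model_conditional (module paths such as tf.nn.rnn_cell moved around between TF versions): the LSTM that reads the tweet is initialised with the final state of the LSTM that read the target.

```python
import tensorflow as tf  # TF 1.x-style API, for illustration only


def conditional_encoding_sketch(target_embedded, tweet_embedded, hidden_size):
    # Encode the target; keep only its final LSTM state.
    with tf.variable_scope("target_encoder"):
        target_cell = tf.nn.rnn_cell.BasicLSTMCell(hidden_size)
        _, target_state = tf.nn.dynamic_rnn(target_cell, target_embedded, dtype=tf.float32)

    # Encode the tweet, conditioned on the target by using the target's
    # final state as the tweet LSTM's initial state.
    with tf.variable_scope("tweet_encoder"):
        tweet_cell = tf.nn.rnn_cell.BasicLSTMCell(hidden_size)
        outputs, tweet_state = tf.nn.dynamic_rnn(tweet_cell, tweet_embedded,
                                                 initial_state=target_state)
    return outputs, tweet_state
```

Roughly speaking, the "bicond" variant repeats this in both reading directions and combines the resulting final states before the tanh/softmax projection.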