Automatically generating summaries of English articles with Python 3

This post uses NLTK to generate an extractive summary of an article: each word is scored by its normalized frequency, each sentence by the sum of its word scores, and the n highest-scoring sentences are returned as the summary.

from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from collections import defaultdict
from string import punctuation
from heapq import nlargest

import nltk
nltk.download('stopwords')
nltk.download('punkt')
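Note: some newer NLTK releases ship the pre-trained sentence tokenizer as punkt_tab instead of punkt; if sent_tokenize raises a LookupError, downloading that resource as well should fix it:

nltk.download('punkt_tab')  # only needed on newer NLTK versions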


# Combine NLTK's English stopwords with punctuation so both are filtered out;
# a distinct name avoids shadowing the imported stopwords module.
stop_words = set(stopwords.words('english') + list(punctuation))
# Normalized-frequency bounds: words scoring >= max_cut (too common)
# or <= min_cut (too rare) are discarded.
max_cut = 0.9
min_cut = 0.1
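A quick illustrative check that the combined set covers both kinds of token:

print('the' in stop_words)  # True: an English stopword
print(',' in stop_words)    # True: punctuation is filtered too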

"""
计算出每个词出现的频率
word_sent 是一个已经分好词的列表
返回一个词典freq[],
freq[w]代表了w出现的频率
"""
def compute_frequencies(word_sent):
    """
    defaultdict和普通的dict
    的区别是它可以设置default值
    参数是int默认值是0
    """
    freq = defaultdict(int)

    # Count how often each non-stopword occurs.
    for s in word_sent:
        for word in s:
            # Skip stopwords and punctuation.
            if word not in stop_words:
                freq[word] += 1

    # Find the highest raw count m.
    m = float(max(freq.values()))
    # Normalize every count by m; drop words that are too common
    # (>= max_cut) or too rare (<= min_cut) to be informative.
    for w in list(freq.keys()):
        freq[w] = freq[w] / m
        if freq[w] >= max_cut or freq[w] <= min_cut:
            del freq[w]
    # The result maps {word: importance score}.
    return freq
    return freq
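As a sanity check, the function can be exercised on a hypothetical pre-tokenized toy input (not part of the original script):

# Two tokenized toy "sentences" (hypothetical example).
toy = [["cats", "chase", "mice"],
       ["cats", "nap", "daily"]]
print(compute_frequencies(toy))
# 'cats' appears twice, so it normalizes to 1.0 and is dropped by
# max_cut; the remaining words each keep a score of 0.5.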

def summarize(text, n):
    """
    用来总结的主要函数
    text是输入的文本
    n是摘要的句子个数
    返回包含摘要的列表
    """

    # First, split the text into sentences.
    sents = sent_tokenize(text)
    assert n <= len(sents)

    # Then tokenize each sentence into words.
    word_sent = [word_tokenize(s.lower()) for s in sents]

    # freq maps each word to its importance score.
    freq = compute_frequencies(word_sent)
    # ranking maps each sentence index to its importance score.
    ranking = defaultdict(int)
    for i, words in enumerate(word_sent):
        for w in words:
            if w in freq:
                ranking[i] += freq[w]
    sents_idx = rank(ranking, n)
    return [sents[j] for j in sents_idx]

"""
考虑到句子比较多的情况
用遍历的方式找最大的n个数比较慢
我们这里调用heapq中的函数
创建一个最小堆来完成这个功能
返回的是最小的n个数所在的位置
"""    
def rank(ranking, n):
    return nlargest(n, ranking, key=ranking.get)
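For example, given hypothetical sentence scores, rank returns the indices with the highest values, best first:

# Hypothetical scores: sentences 2 and 0 rank highest.
demo_scores = {0: 1.4, 1: 0.3, 2: 2.1}
print(rank(demo_scores, 2))  # [2, 0]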


if __name__ == '__main__':
    with open("news.txt", "r") as myfile:  # news.txt holds the article text
        # Join lines with a space so words at line breaks don't run together.
        text = myfile.read().replace('\n', ' ')
    res = summarize(text, 3)
    for sent in res:
        print(sent)
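If no news.txt is at hand, the pipeline can be smoke-tested by swapping the file-reading block for a short inline paragraph (hypothetical sample text):

    # Quick smoke test without an input file (hypothetical sample text).
    sample = (
        "NLTK is a leading platform for building Python programs "
        "to work with human language data. "
        "It provides interfaces to many corpora and lexical resources. "
        "NLTK also ships tokenizers, stemmers, and taggers for "
        "processing language. "
        "Developers use these language tools in real projects."
    )
    for sent in summarize(sample, 2):
        print(sent)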