Posted on

I have been wondering what people do with nucleotide sequencing of blood plasma, so I decided to do a bit of data mining in the PubMed database to answer this question.

I searched PubMed with the keywords 'plasma sequencing' and built a network from the search results, using the words in the article titles.

The network is plotted here:

From the word network, I would say plasma sequencing has been applied to non-invasive screening in pregnancies and cancer patients.

The following is the script that I used to generate the figure.

#!/usr/bin/env python

import matplotlib 
matplotlib.use('Agg')
from Bio import Entrez, Medline
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import itertools
import networkx as nx

def parseingRcords(record, wordCombCount):
    """
    Tally the co-occurring title words of one article.

    Every word of the record's title (the 'TI' field, split on single
    spaces) that is longer than three characters is paired with itself and
    with each later word of the same title, and the tally of that pair in
    wordCombCount is incremented by one.

    record: dict-like article record; a record without a 'TI' field
            contributes nothing.
    wordCombCount: dict mapping (word1, word2) tuples to counts; it is
                   updated in place.

    Returns the same wordCombCount dict.
    """
    # A record may lack a title; treat it as empty instead of raising
    # KeyError and aborting the whole search.
    title = record.get('TI') or ''
    wordList = [word for word in title.split(' ') if len(word) > 3]
    # combinations_with_replacement also yields the (word, word) self pairs.
    for comb in itertools.combinations_with_replacement(wordList, 2):
        wordCombCount[comb] = wordCombCount.get(comb, 0) + 1
    return wordCombCount


def searchPubMed(keyword, wordCombCount, articleCount, searchCount):
    """
    Fetch one batch of PubMed articles matching keyword through the Entrez
    API and add their title word pairs to wordCombCount.

    keyword: search term handed to Entrez.esearch.
    wordCombCount: dict of (word1, word2) -> count, updated through
                   parseingRcords for every fetched record.
    articleCount: passed through and returned unchanged.
    searchCount: zero-based index of the batch to fetch; it selects the
                 offset into the result set.

    Returns (searchCount + 1, articleCount, wordCombCount).
    """
    # NOTE(review): current E-utilities documentation appears to cap PubMed
    # esearch results at 10,000 UIDs -- confirm this batch size is honoured.
    maxFetch = 100000
    startPoint = searchCount * maxFetch
    # 'retstart' is the E-utilities paging offset. The earlier 'restart'
    # spelling is not an E-utilities parameter, so the offset never took
    # effect and every batch asked for the first page again.
    searchHandle = Entrez.esearch(db='pubmed', term=keyword,
                                  retmax=maxFetch, retstart=startPoint)
    try:
        ids = Entrez.read(searchHandle)['IdList']
    finally:
        searchHandle.close()
    # Skip the fetch when the batch is empty (offset past the last hit).
    if ids:
        fetchHandle = Entrez.efetch(db='pubmed', id=ids,
                                    rettype="Medline", retmode='text')
        try:
            # Medline.parse reads lazily, so the handle must stay open
            # until the loop is done.
            for record in Medline.parse(fetchHandle):
                wordCombCount = parseingRcords(record, wordCombCount)
        finally:
            fetchHandle.close()
    searchCount += 1
    print('Finished searching %i articles' % (maxFetch + startPoint))
    return searchCount, articleCount, wordCombCount

def plotNetwork(df):
    """
    Build a graph from a word-pair table and save a drawing of it.

    df: pandas data frame with the columns 'word1', 'word2' and 'count';
        each row becomes an edge between word1 and word2 that carries
        'count' as an edge attribute.

    Writes the figure to 'titleWordingNetwork.pdf' (relative to the current
    working directory) and returns 0.
    """
    figurename = 'titleWordingNetwork.pdf'
    # networkx 2.0 renamed from_pandas_dataframe to from_pandas_edgelist and
    # later releases dropped the old name; use whichever one is available.
    buildGraph = getattr(nx, 'from_pandas_edgelist', None)
    if buildGraph is None:
        buildGraph = nx.from_pandas_dataframe
    graph = buildGraph(df, 'word1', 'word2', 'count')
    nx.draw_networkx(graph, with_labels=True, edge_color='green', alpha=0.7)
    plt.savefig(figurename)
    print('Written file: %s' % figurename)
    return 0

def main():
    """
    Search PubMed for the keyword in several batches, tabulate the title
    word pairs that were counted and plot them as a network.

    Returns 0.
    """
    Entrez.email = 'wckdouglas@gmail.com'
    keyword = 'plasma+sequencing'
    searchTimes = 4

    pairCounts = {}
    articleCount = 0
    searchCount = 0
    for _ in range(searchTimes):
        searchCount, articleCount, pairCounts = searchPubMed(
            keyword, pairCounts, articleCount, searchCount)

    # One row per word pair: its count followed by the two words.
    rows = [(count, pair[0], pair[1]) for pair, count in pairCounts.items()]
    df = pd.DataFrame(rows, columns=['count', 'word1', 'word2'])
    plotNetwork(df)
    return 0

# Run the analysis only when this file is executed as a script.
if __name__ == "__main__":
    main()