使用 itertools.groupby 对 NumPy 数组进行分组计数的性能问题

我有很多很大的(> 35,000,000 个元素)整数列表,其中包含重复项。我需要统计列表中每个整数出现的次数。以下代码可以工作,但似乎很慢。有人能用 Python(最好是 NumPy)写出比这个基准更快的实现吗?

def group(size=35000000, high=1 << 32):
    """Count occurrences of each distinct integer using ``itertools.groupby``.

    This is the baseline implementation whose speed is being questioned. The
    per-group ``len(list(g))`` is deliberately left as is, because it is the
    approach that was measured.

    Args:
        size: Number of random integers to generate.
        high: Exclusive upper bound of the random integers. It must not
            exceed ``1 << 32`` because the values are stored as uint32.

    Returns:
        A structured array of dtype ``'u4,u2'`` with one row per distinct
        value: field ``f0`` holds the value and ``f1`` its count. Counts are
        stored as uint16, so a value occurring more than 65535 times cannot
        be represented correctly.
    """
    import numpy as np
    from itertools import groupby

    values = np.array(np.random.randint(0, high, size=size), dtype='u4')
    # groupby only merges *adjacent* equal items, so the data must be sorted.
    values.sort()
    groups = ((k, len(list(g))) for k, g in groupby(values))
    index = np.fromiter(groups, dtype='u4,u2')
    return index

if __name__ == '__main__':
    # Time a single run of the baseline; one run is enough because a run
    # over 35 million elements takes on the order of minutes.
    from timeit import Timer

    t = Timer("group()", "from __main__ import group")
    # The single-argument call form prints identically on Python 2 and 3.
    print(t.timeit(number=1))

返回:

$ python bench.py 

111.377498865

谢谢!

(补充)根据大家的回答,我整理了如下的对比基准测试:

def group_original(size=35000000, high=1 << 32):
    """Baseline: count each distinct integer via ``groupby`` + ``len(list(g))``.

    Kept algorithmically unchanged so that it stays the reference point for
    the other implementations in the comparison.

    Args:
        size: Number of random integers to generate.
        high: Exclusive upper bound of the random integers. It must not
            exceed ``1 << 32`` because the values are stored as uint32.

    Returns:
        A structured array of dtype ``'u4,u2'`` with one row per distinct
        value: field ``f0`` holds the value and ``f1`` its count. Counts are
        stored as uint16, so a value occurring more than 65535 times cannot
        be represented correctly.
    """
    import numpy as np
    from itertools import groupby

    values = np.array(np.random.randint(0, high, size=size), dtype='u4')
    # groupby only merges *adjacent* equal items, so the data must be sorted.
    values.sort()
    # Materialising every group as a list is the main cost of this variant.
    groups = ((k, len(list(g))) for k, g in groupby(values))
    index = np.fromiter(groups, dtype='u4,u2')
    return index

def group_gnibbler(size=35000000, high=1 << 32):
    """Count each distinct integer via ``groupby``, counting without a list.

    Differs from the baseline only in how a group is counted: it sums ``1``
    per item instead of building a temporary list and taking its length.

    Args:
        size: Number of random integers to generate.
        high: Exclusive upper bound of the random integers. It must not
            exceed ``1 << 32`` because the values are stored as uint32.

    Returns:
        A structured array of dtype ``'u4,u2'`` with one row per distinct
        value: field ``f0`` holds the value and ``f1`` its count. Counts are
        stored as uint16, so a value occurring more than 65535 times cannot
        be represented correctly.
    """
    import numpy as np
    from itertools import groupby

    values = np.array(np.random.randint(0, high, size=size), dtype='u4')
    # groupby only merges *adjacent* equal items, so the data must be sorted.
    values.sort()
    groups = ((k, sum(1 for _ in g)) for k, g in groupby(values))
    index = np.fromiter(groups, dtype='u4,u2')
    return index

def group_christophe(size=35000000, high=1 << 32):
    """Count each distinct integer with two binary searches per value.

    For a sorted array, the number of occurrences of ``v`` is the distance
    between its right-most and left-most insertion points.

    NOTE: as originally posted and timed, this function probed with *every*
    element and therefore produced one row per element (duplicates included)
    rather than one row per distinct value -- the "erroneous result" the
    post refers to. Here only the distinct values are probed.

    Args:
        size: Number of random integers to generate.
        high: Exclusive upper bound of the random integers. It must not
            exceed ``1 << 32`` because the values are stored as uint32.

    Returns:
        A structured array of dtype ``'u4,u2'`` with one row per distinct
        value: field ``f0`` holds the value and ``f1`` its count. Counts are
        stored as uint16, so a value occurring more than 65535 times cannot
        be represented correctly.
    """
    import numpy as np

    values = np.array(np.random.randint(0, high, size=size), dtype='u4')
    values.sort()

    # Mark the first element of every run of equal values; built with
    # slice assignment so that an empty input is handled as well.
    is_first = np.ones(len(values), dtype=bool)
    is_first[1:] = values[1:] != values[:-1]
    uniques = values[is_first]

    counts = (values.searchsorted(uniques, side='right')
              - values.searchsorted(uniques, side='left'))

    index = np.zeros(len(uniques), dtype='u4,u2')
    index['f0'] = uniques
    index['f1'] = counts
    return index

def group_paul(size=35000000, high=1 << 32):
    """Count each distinct integer from the positions where the value changes.

    After sorting, a non-zero difference between neighbours marks the start
    of a new run; the gaps between consecutive run starts are the counts.

    Args:
        size: Number of random integers to generate.
        high: Exclusive upper bound of the random integers. It must not
            exceed ``1 << 32`` because the values are stored as uint32.

    Returns:
        A structured array of dtype ``'u4,u2'`` with one row per distinct
        value: field ``f0`` holds the value and ``f1`` its count. Counts are
        stored as uint16, so a value occurring more than 65535 times cannot
        be represented correctly.
    """
    import numpy as np

    values = np.array(np.random.randint(0, high, size=size), dtype='u4')
    if len(values) == 0:
        # The leading sentinel below would otherwise point at a
        # non-existent element 0.
        return np.empty(0, dtype='u4,u2')
    values.sort()

    # Leading 1 forces position 0 to count as the start of the first run.
    diff = np.concatenate(([1], np.diff(values)))
    # Run starts, plus len(values) as the end boundary of the last run.
    idx = np.concatenate((np.where(diff)[0], [len(values)]))

    index = np.empty(len(idx) - 1, dtype='u4,u2')
    index['f0'] = values[idx[:-1]]
    index['f1'] = np.diff(idx)
    return index

if __name__ == '__main__':
    # Run every candidate once and report its wall-clock time.
    from timeit import Timer

    # (function name in this module, label used in the report)
    timings = [
        ("group_original", "Original"),
        ("group_gnibbler", "Gnibbler"),
        ("group_christophe", "Christophe"),
        ("group_paul", "Paul"),
    ]

    for method, title in timings:
        t = Timer("%s()" % method, "from __main__ import %s" % method)
        # The single-argument call form prints identically on Python 2 and 3.
        print("%s: %s secs" % (title, t.timeit(number=1)))

返回:

$ python bench.py 

Original: 113.385262966 secs

Gnibbler: 71.7464978695 secs

Christophe: 27.1690568924 secs

Paul: 9.06268405914 secs

不过需要注意,Christophe 的方法在上述测试时给出的结果并不正确。

回答:

我用类似下面的做法获得了大约 3 倍的提速:

def group(size=35000000, high=3298):
    """Count each distinct integer by locating the start of every run.

    After sorting, a positive difference between neighbours marks the first
    element of a new run of equal values; the distance between consecutive
    run starts (and from the last start to the end of the array) is the
    count of each value.

    Args:
        size: Number of random integers to generate.
        high: Exclusive upper bound of the random integers. It must not
            exceed ``1 << 32`` because the values are stored as uint32.

    Returns:
        A ``(vals, count)`` tuple of equal-length 1-D arrays: the distinct
        values in ascending order and the number of occurrences of each.
    """
    import numpy as np

    values = np.array(np.random.randint(0, high, size=size), dtype='u4')
    values.sort()

    # dif[0] stays 1 so that position 0 always counts as a run start.
    dif = np.ones(values.shape, values.dtype)
    dif[1:] = np.diff(values)

    # np.where returns a tuple of index arrays; take the single 1-D array,
    # otherwise np.diff would run over a (1, k) array.
    idx = np.where(dif > 0)[0]
    vals = values[idx]
    # Append len(values) as the closing boundary so that the last run gets
    # a count too.
    count = np.diff(np.concatenate((idx, [len(values)])))
    return vals, count

以上是 使用itertools.groupby性能进行numpy分组 的全部内容, 来源链接: utcz.com/qa/404279.html

回到顶部