一个Python和C性能对比的好例子

时间：2006-06-20 来源：me09

VCKbase的论坛上有人提了一个问题：
遇到的问题是，我把E-mail地址收集在txt文件中（每行一个E-mail地址，共78000行），要求是把其中重复的E-mail地址全部去掉。考虑编程方便，我用了CString类，但似乎效率不高。我写出来的整理78000份E-mail地址的需要近7分钟的时间

我给他用Python实现了一个(这个版本是修改过的，当时我随手实现的一个版本做错了，呵呵)
这里算法的主要内容就是利用hash_table来判断email是否已经重复出现，这比字符串的查找要快的多。

#remove duplicated email address from file
import datetime

if __name__ == "__main__":
 t = datetime.datetime(2000,1,1)
 print str(t.today())
 hashtable = {}
 f = file("email.txt","r")
 f2 = file("email_new.txt","w")
 line = f.readline();
 while len(line)>0:
 if not hashtable.has_key(line): 
 hashtable[line] = 1
 f2.write(line)
 line = f.readline();
 f.close()
 f2.close()
 t2 = datetime.datetime(2000,1,1)
 print str(t2.today())

上面的算法在我的机器上：P4 2.8G,512M 内存，耗时大约300ms，跟他的7分钟差了天了。而我当时错误的算法，需要2分半钟，所以我贴到我们内部的BBS上，我的一个同学实现了如下算法(C):

#include 
#include 
#include 
#include "list.h"

#define HASH_SIZE 262157
#define LINE_MAX_LEN 1024

struct list_node {
 struct list_head head;
 int len;
 char *str;
};

struct list_head hash_table[HASH_SIZE];

static inline int
hash_string (const char *ptr, int len)
{
 unsigned int hash = 0;

for (hash = 0; len; len--, ptr++) {
 /* (31 * hash) will probably be optimized to ((hash << 5) - hash). */
 hash = 31 * hash + *ptr;
 }

return (hash % HASH_SIZE);
}

static inline int
cmpfn(const struct list_node *node, int len, char *str)
{
 return ((node->len == len) && (memcmp(node->str, str, len) == 0));
}

int main(int argc, char **argv)
{
 int i, hash, len;
 struct list_node *node;
 char buf[LINE_MAX_LEN];
 FILE *fp_in, *fp_out;

if(argc != 3) {
 printf("error: \nplease use eml as:\n eml infile outfile\n");
 return -1;
 }
 fp_in = fopen(argv[1], "r");
 if(!fp_in) {
 printf("error: could not open infile %s\n", argv[1]);
 return -1;
 }
 fp_out = fopen(argv[2], "w");
 if(!fp_out) {
 printf("error: could not open infile %s\n", argv[2]);
 return -1;
 }
 for(i=0; istr = (char *)malloc(len + 1);
 node->len = len;
 memcpy(node->str, buf, len);
 node->str[len] = '\0';
 list_append(&(hash_table[hash]), node);
 }
 }

return 0;
}

还有.h文件：

#ifndef _LINUX_LIST_H
#define _LINUX_LIST_H

extern inline void prefetch(const void *x)
{
 return;
}
/*
 * Simple doubly linked list implementation.
 *
 * Some of the internal functions ("__xxx") are useful when
 * manipulating whole lists rather than single entries, as
 * sometimes we already know the next/prev entries and we can
 * generate better code by using them directly rather than
 * using the generic single-entry routines.
 */

struct list_head {
 struct list_head *next, *prev;
};

#define LIST_HEAD_INIT(name) { &(name), &(name) }

#define LIST_HEAD(name) \
 struct list_head name = LIST_HEAD_INIT(name)

#define INIT_LIST_HEAD(ptr) do { \
 (ptr)->next = (ptr); (ptr)->prev = (ptr); \
} while (0)

/*
 * Insert a new entry between two known consecutive entries. 
 *
 * This is only for internal list manipulation where we know
 * the prev/next entries already!
 */
static inline void
__list_add(struct list_head *new,
 struct list_head *prev, struct list_head *next)
{
 next->prev = new;
 new->next = next;
 new->prev = prev;
 prev->next = new;
}

/**
 * list_add - add a new entry
 * @new: new entry to be added
 * @head: list head to add it after
 *
 * Insert a new entry after the specified head.
 * This is good for implementing stacks.
 */
static inline void
list_add(struct list_head *new, struct list_head *head)
{
 __list_add(new, head, head->next);
}

/**
 * list_add_tail - add a new entry
 * @new: new entry to be added
 * @head: list head to add it before
 *
 * Insert a new entry before the specified head.
 * This is useful for implementing queues.
 */
static inline void
list_add_tail(struct list_head *new, struct list_head *head)
{
 __list_add(new, head->prev, head);
}

/*
 * Delete a list entry by making the prev/next entries
 * point to each other.
 *
 * This is only for internal list manipulation where we know
 * the prev/next entries already!
 */
static inline void
__list_del(struct list_head *prev, struct list_head *next)
{
 next->prev = prev;
 prev->next = next;
}

/**
 * list_del - deletes entry from list.
 * @entry: the element to delete from the list.
 * Note: list_empty on entry does not return true after this, the entry is in an undefined state.
 */
static inline void
list_del(struct list_head *entry)
{
 __list_del(entry->prev, entry->next);
 entry->next = (void *) 0;
 entry->prev = (void *) 0;
}

/**
 * list_del_init - deletes entry from list and reinitialize it.
 * @entry: the element to delete from the list.
 */
static inline void
list_del_init(struct list_head *entry)
{
 __list_del(entry->prev, entry->next);
 INIT_LIST_HEAD(entry);
}

/**
 * list_move - delete from one list and add as another's head
 * @list: the entry to move
 * @head: the head that will precede our entry
 */
static inline void
list_move(struct list_head *list, struct list_head *head)
{
 __list_del(list->prev, list->next);
 list_add(list, head);
}

/**
 * list_move_tail - delete from one list and add as another's tail
 * @list: the entry to move
 * @head: the head that will follow our entry
 */
static inline void
list_move_tail(struct list_head *list, struct list_head *head)
{
 __list_del(list->prev, list->next);
 list_add_tail(list, head);
}

/**
 * list_empty - tests whether a list is empty
 * @head: the list to test.
 */
static inline int
list_empty(struct list_head *head)
{
 return head->next == head;
}

static inline void
__list_splice(struct list_head *list, struct list_head *head)
{
 struct list_head *first = list->next;
 struct list_head *last = list->prev;
 struct list_head *at = head->next;

first->prev = head;
 head->next = first;

last->next = at;
 at->prev = last;
}

/**
 * list_splice - join two lists
 * @list: the new list to add.
 * @head: the place to add it in the first list.
 */
static inline void
list_splice(struct list_head *list, struct list_head *head)
{
 if (!list_empty(list))
 __list_splice(list, head);
}

/**
 * list_splice_init - join two lists and reinitialise the emptied list.
 * @list: the new list to add.
 * @head: the place to add it in the first list.
 *
 * The list at @list is reinitialised
 */
static inline void
list_splice_init(struct list_head *list, struct list_head *head)
{
 if (!list_empty(list)) {
 __list_splice(list, head);
 INIT_LIST_HEAD(list);
 }
}

/**
 * list_entry - get the struct for this entry
 * @ptr: the &struct list_head pointer.
 * @type: the type of the struct this is embedded in.
 * @member: the name of the list_struct within the struct.
 */
#define list_entry(ptr, type, member) \
 ((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member)))

/**
 * list_for_each - iterate over a list
 * @pos: the &struct list_head to use as a loop counter.
 * @head: the head for your list.
 */
#define list_for_each(pos, head) \
 for (pos = (head)->next, prefetch(pos->next); pos != (head); \
 pos = pos->next, prefetch(pos->next))

/**
 * list_for_each_safe - iterate over a list safe against removal of list entry
 * @pos: the &struct list_head to use as a loop counter.
 * @n: another &struct list_head to use as temporary storage
 * @head: the head for your list.
 */
#define list_for_each_safe(pos, n, head) \
 for (pos = (head)->next, n = pos->next; pos != (head); \
 pos = n, n = pos->next)

/**
 * list_for_each_entry - iterate over list of given type
 * @pos: the type * to use as a loop counter.
 * @head: the head for your list.
 * @member: the name of the list_struct within the struct.
 */
#define list_for_each_entry(pos, head, member) \
 for (pos = list_entry((head)->next, typeof(*pos), member), \
 prefetch(pos->member.next); \
 &pos->member != (head); \
 pos = list_entry(pos->member.next, typeof(*pos), member), \
 prefetch(pos->member.next))

/*this is the begin of listhelp ;-)*/

#define LIST_FIND(head, cmpfn, type, args...) \
({ \
 const struct list_head *__i = (head); \
 \
 do { \
 __i = __i->next; \
 if (__i == (head)) { \
 __i = NULL; \
 break; \
 } \
 } while (!cmpfn((const type)__i , ## args)); \
 (type)__i; \
})

#define LIST_FIND_W(head, cmpfn, type, args...) \
({ \
 const struct list_head *__i = (head); \
 \
 do { \
 __i = __i->next; \
 if (__i == (head)) { \
 __i = NULL; \
 break; \
 } \
 } while (!cmpfn((type)__i , ## args)); \
 (type)__i; \
})

static inline int
__list_cmp_same(const void *p1, const void *p2)
{
 return p1 == p2;
}

/* Is this entry in the list? */
static inline int
list_inlist(struct list_head *head, const void *entry)
{
 return LIST_FIND(head, __list_cmp_same, void *, entry) !=NULL;
}

/* Delete from list. */
#ifdef CONFIG_NETFILTER_DEBUG
#define LIST_DELETE(head, oldentry) \
do { \
 if (!list_inlist(head, oldentry)) \
 printk("LIST_DELETE: %s:%u `%s'(%p) not in %s.\n", \
 __FILE__, __LINE__, #oldentry, oldentry, #head); \
 else list_del((struct list_head *)oldentry); \
} while(0)
#else
#define LIST_DELETE(head, oldentry) list_del((struct list_head *)oldentry)
#endif

/* Append. */
static inline void
list_append(struct list_head *head, void *new)
{
 list_add((new), (head)->prev);
}

/* Prepend. */
static inline void
list_prepend(struct list_head *head, void *new)
{
 list_add(new, head);
}

/* Insert according to ordering function; insert before first true. */
#define LIST_INSERT(head, new, cmpfn) \
do { \
 struct list_head *__i; \
 for (__i = (head)->next; \
 !cmpfn((new), (typeof (new))__i) && __i != (head); \
 __i = __i->next); \
 list_add((struct list_head *)(new), __i->prev); \
} while(0)

/* If the field after the list_head is a nul-terminated string, you
 can use these functions. */
static inline int
__list_cmp_name(const void *i, const char *name)
{
 return strcmp(name, i + sizeof (struct list_head)) == 0;
}

static inline int
__list_cmp_nameptr(const void *i, const char *name)
{
 return strcmp(name, ((char *)
 *(unsigned long *) (i +
 sizeof (struct list_head)))) ==
 0;
}

/* Returns false if same name already in list, otherwise does insert. */
static inline int
list_named_insert(struct list_head *head, void *new)
{
 if (LIST_FIND(head, __list_cmp_name, void *,
 new + sizeof (struct list_head)))
 return 0;
 list_prepend(head, new);
 return 1;
}

static inline int
list_nameptr_insert(struct list_head *head, void *new)
{
 if (LIST_FIND(head, __list_cmp_nameptr, void *,
 new + sizeof (struct list_head)))
 return 0;
 list_prepend(head, new);
 return 1;
}

/* Find this named element in the list. */
#define list_named_find(head, name) \
LIST_FIND(head, __list_cmp_name, void *, name)

#define list_nameptr_find(head, name) \
LIST_FIND(head, __list_cmp_nameptr, void *, name)

#endif

在我的机器上测试(cygwin)：耗时竟然是400ms!
当然，这里包含了Cygwin的间接调用开销。

很难想象吧，Python在这里的性能竟然跟C不相上下！而且Python的代码比C不知道短了多少倍！
所以讲，Python是蛮好用的，:-).
不过，这个例子其实是个特殊情况，以前我们玩过的另外一个程序，C的性能就比Python快得多。不过代码量还是Python简短得多。有兴趣的朋友可以用自己熟悉的语言来测试一下，下面是测试文件(也是我用11行python代码产生的,有1/10的email是重复的)：
这其实是一个rar,下载后请修改扩展名