# _*_ coding: gbk _*_
"""Download the first few JPEG images referenced by a web page.

The script was written for Python 2 (urllib/urllib2). The import fallback
below lets the same file run on Python 3 as well.
"""
import re
import urllib

try:
    import urllib2
    urlretrieve = urllib.urlretrieve
except ImportError:  # Python 3 merged urllib2 and urlretrieve into urllib.request
    import urllib.request as urllib2
    urlretrieve = urllib2.urlretrieve


class Spider:
    """Fetch a page and save the JPEG images named in its ``src`` attributes."""

    # The match is bounded by the attribute's closing quote. The previous
    # pattern used a greedy ``.*`` and therefore swallowed everything between
    # the first ``src="http:`` and the last ``.jpg`` on a line, merging
    # several tags into one unusable "URL".
    IMAGE_PATTERN = re.compile(r'src="(http:[^"]*\.jpg)')

    # Destination of the n-th download; ``%s`` receives the 0-based index.
    IMAGE_PATH = 'E:\\images\\%s.jpg'

    def findImages(self, text):
        """Return every ``http:`` JPEG URL found in a ``src="..."`` attribute.

        Args:
            text: Page markup, either as text or as raw bytes. Bytes are
                decoded as UTF-8 with undecodable bytes replaced; this keeps
                every ASCII URL intact whatever the page's real charset is.

        Returns:
            A list of URL strings in document order (possibly empty). Each
            URL ends at the last ``.jpg`` inside its attribute value.
        """
        if isinstance(text, bytes):
            text = text.decode('utf-8', 'replace')
        return self.IMAGE_PATTERN.findall(text)

    def getImage(self, html, limit=2):
        """Download the first ``limit`` JPEG images referenced by a page.

        Each URL is printed before it is fetched, and ``the end`` is printed
        once the loop is finished. Files are written to ``IMAGE_PATH`` with
        the 0-based download index substituted in.

        Args:
            html: URL of the page to scan (despite the name, not markup).
            limit: Maximum number of images to download. Defaults to 2,
                the value that used to be hard-coded. ``None`` downloads
                every match.
        """
        page = urllib2.urlopen(html)
        try:
            content = page.read()
        finally:
            # Python 2 responses are not context managers, so close by hand.
            page.close()

        for index, url in enumerate(self.findImages(content)[:limit]):
            print(url)
            urlretrieve(url, self.IMAGE_PATH % index)
        print('the end')


if __name__ == "__main__":
    print('hello')
    s = Spider()
    # Alternative page that was previously used:
    # html = r"http://baike.baidu.com/link?url=pj6QaA2Zyrxx2WcD4f7vN50LWVIZjJUKYdnnLGMOWnmInlALGH4dXmU86hE3Ar-jmaiahjf2MiEZ3n_0WCOUlFuKwVfYZNKnBwxidD1cC3i"
    html = r"http://baike.baidu.com/link?url=rHaKx7RPBWuR4MxzY0BPhwbLKH4DEdwKPN8EYH-78Zzm7IMUuFTYM0eUZw-j27lHxDxyyNiqkjUg4JG8FvyjNUsuqiTzLixsNSXUtTWiOpQqrtxbf4hkj-n6gF1Nyn4D"
    s.getImage(html)
用 Python 从某个网站上批量爬取图片的 URL:主要来自百度风云榜,包括男演员、女演员、男歌手、女歌手,总共 200 张。
# _*_ coding: gbk _*_
# Collect the URLs of links marked with the two-character label U+7B80 U+4ECB
# ("introduction") on a page and write them to a text file, one per line.
#
# The script was written for Python 2 (urllib/urllib2). The import fallback
# below lets the same file run on Python 3 as well.
import io
import os  # unused here; kept because the original script imported it
import re
import urllib  # unused here; kept because the original script imported it

try:
    import urllib2
except ImportError:  # Python 3 merged urllib2 into urllib.request
    import urllib.request as urllib2


class Spider:
    """Fetch a page and record the targets of its "introduction" links."""

    # Captures the quoted ``href`` value when the marker text follows it
    # before any other tag opens. The marker is spelled with unicode escapes
    # so the source no longer depends on the file's encoding declaration.
    # The previous pattern used a greedy ``.*`` plus a fixed-width slice,
    # which spanned several links on one line and relied on the marker's
    # byte width in the source encoding.
    LINK_PATTERN = re.compile(u'href="(http:[^"]*)"[^<]*\u7b80\u4ecb')

    # The previous literal was a raw string with doubled backslashes, which
    # put two backslashes between each path component.
    OUTPUT_PATH = 'e:\\images\\paths.txt'

    def findLinks(self, text, encoding='gbk'):
        """Return the ``href`` targets of links carrying the marker text.

        Args:
            text: Page markup, either as text or as raw bytes.
            encoding: Charset used to decode ``text`` when it is bytes.
                Defaults to GBK: the original code matched a GBK-encoded
                literal against the raw page bytes, so the page is assumed
                to be GBK (NOTE(review): confirm against the live page).

        Returns:
            A list of URL strings in document order (possibly empty).
        """
        if isinstance(text, bytes):
            text = text.decode(encoding, 'replace')
        return self.LINK_PATTERN.findall(text)

    def getImage(self, html):
        """Write every marked link found on a page to ``OUTPUT_PATH``.

        Each URL is printed and written on its own line; afterwards the
        number of URLs and ``the end`` are printed.

        Args:
            html: URL of the page to scan (despite the name, not markup).
        """
        page = urllib2.urlopen(html)
        try:
            content = page.read()
        finally:
            # Python 2 responses are not context managers, so close by hand.
            page.close()

        links = self.findLinks(content)
        # io.open behaves the same on Python 2 and 3 and pins the encoding.
        with io.open(self.OUTPUT_PATH, 'w', encoding='utf-8') as f:
            for link in links:
                print(link)
                f.write(link + u'\n')
        print(len(links))
        print('the end')


if __name__ == "__main__":
    print('hello')
    s = Spider()
    # Alternative page that was previously used:
    # html = r"http://baike.baidu.com/link?url=pj6QaA2Zyrxx2WcD4f7vN50LWVIZjJUKYdnnLGMOWnmInlALGH4dXmU86hE3Ar-jmaiahjf2MiEZ3n_0WCOUlFuKwVfYZNKnBwxidD1cC3i"
    html = r'http://top.baidu.com/buzz?b=18&qq-pf-to=pcqq.group'
    s.getImage(html)