代码如下:
Python客栈送红包、纸质书
#coding=gbk
importstring
importcodecs
importos,time
importxlwt
importxlrd
frombs4importBeautifulSoup
fromxlrdimportopen_workbook
class LogMsg:
    """Thin wrapper that routes messages to a log file via the root logger.

    Creating an instance attaches a ``FileHandler`` for *logfile* to the
    root logger and sets the root logger's level.  Any failure while
    setting up, writing or closing prints a short message and terminates
    the process with exit status 1.
    """

    # Numeric levels recognised by this class (DEBUG .. CRITICAL).
    _KNOWN_LEVELS = (10, 20, 30, 40, 50)
    # Level used by output() when the effective level is not a known one.
    _INFO = 20

    def __init__(self, logfile, Level=0):
        """Attach a file handler for *logfile* and apply *Level*.

        Args:
            logfile: Path of the log file to append to.
            Level: One of 10, 20, 30, 40, 50 (DEBUG, INFO, WARNING, ERROR,
                CRITICAL).  Any other value selects ``logging.NOTSET``.

        Raises:
            SystemExit: With status 1 if the logger cannot be set up
                (for example, the log file cannot be opened).
        """
        try:
            # Imported inside the try block so that an import failure is
            # reported the same way as any other setup failure.
            import logging

            levels = {
                10: logging.DEBUG,
                20: logging.INFO,
                30: logging.WARNING,
                40: logging.ERROR,
                50: logging.CRITICAL,
            }
            self.logger = logging.getLogger()
            self.hdlr = logging.FileHandler(logfile)
            formatter = logging.Formatter(
                "[%(asctime)s]:%(message)s", "%Y%m%d%H:%M:%S"
            )
            self.hdlr.setFormatter(formatter)
            self.logger.addHandler(self.hdlr)
            self.logger.setLevel(levels.get(Level, logging.NOTSET))
        except Exception:
            print("loginiterror!")
            raise SystemExit(1)

    def output(self, logInfo):
        """Log *logInfo* at the logger's current effective level.

        The message is emitted at exactly the effective level so that it is
        never filtered out; if the effective level is not one of the five
        known levels, the message is logged at INFO.

        Raises:
            SystemExit: With status 1 if the logging call raises.
        """
        level = self.logger.getEffectiveLevel()
        if level not in self._KNOWN_LEVELS:
            level = self._INFO
        try:
            self.logger.log(level, logInfo)
        except Exception:
            print("logoutputerror!")
            raise SystemExit(1)

    def close(self):
        """Detach the file handler from the logger and close the log file.

        Raises:
            SystemExit: With status 1 if detaching or closing fails.
        """
        try:
            self.logger.removeHandler(self.hdlr)
            # Removing the handler alone leaves the file open; close it so
            # the descriptor is released and buffered output is flushed.
            self.hdlr.close()
        except Exception:
            print("logclosederror!")
            raise SystemExit(1)
# Run-wide configuration: input/output directory, timestamped file names,
# and the shared logger (created at INFO level, numeric value 20).
DATAPATH = '/data/pyExample/'

# time.strftime() formats the current local time when no time tuple is given.
Logtime = time.strftime("%Y%m%d%H%M%S")
logFileTime = time.strftime("%Y%m%d")

# Output workbook name carries the full timestamp; the log file name only
# carries the date.
XLSname = ''.join(['dangjian_', Logtime, '.xls'])
Logfile = '/data/pyExample/logs/htmlparser_%s.log' % logFileTime
log = LogMsg(Logfile, 20)
if __name__ == '__main__':
    # Convert every .html file in DATAPATH into one row of an .xls import
    # template: the text of each file's <td> cells is collected in document
    # order and selected cells are copied into fixed columns.
    wbk = xlwt.Workbook(encoding='gbk')
    sheet = wbk.add_sheet('基本内容导入模板')
    headers = [
        '内容类型', '栏目名称', '栏目编号', '内容名称', '时长', '关键字',
        '看点', '作者', '来源', '子内容1', '子内容2',
    ]
    for col, title in enumerate(headers):
        sheet.write(0, col, title)

    # Highest <td> index read below (content[36]); files with fewer cells
    # cannot be mapped onto the template.
    last_cell = 36
    k = 0  # number of data rows written so far
    for f in os.listdir(DATAPATH):
        if os.path.splitext(f)[1] != '.html':
            continue

        log.output('当前文件:' + f)
        # The context manager closes the file even if reading fails.
        with codecs.open(DATAPATH + f, 'r', 'gbk') as htmlFile:
            lines = htmlFile.readlines()
        if not lines:
            log.output('notline')

        content = []
        for line in lines:
            # strip() removes the trailing newline too, so a blank line is
            # one that strips to the empty string.
            if not line.strip():
                log.output('该处是空行')
                continue
            line = line.replace(' ', '')
            soup = BeautifulSoup(line)
            for tdd in soup.findAll('td'):
                content.append(tdd.text.encode("gbk"))

        # enumerate() reports the real position of each cell, including
        # cells whose text repeats earlier in the file.
        for idx, cell in enumerate(content):
            print('%d , %s' % (idx, cell))
            log.output(cell)
            log.output(idx)
        print('----------------------------------------')

        # Skip malformed files instead of aborting the run, so rows that
        # were already collected still get saved.
        if len(content) <= last_cell:
            log.output('skip %s: expected at least %d <td> cells, found %d'
                       % (f, last_cell + 1, len(content)))
            continue
        digits = ''.join(ch for ch in content[16] if ch.isdigit())
        if not digits:
            log.output('skip %s: no digits in duration cell' % f)
            continue

        folderName = content[6]
        contentName = content[4]
        # NOTE(review): the x60 looks like a minutes-to-seconds conversion;
        # confirm against the source pages.
        str_duration = "%i" % (int(digits) * 60)
        keyWord = content[6]
        description = content[36]
        videoName_1 = content[10]
        for value in (folderName, contentName, str_duration,
                      keyWord, description, videoName_1):
            print(value)

        # One entry per template column, in column order.
        row = [
            '', folderName, '', contentName, str_duration, keyWord,
            description, '管理员', '华数编辑', videoName_1, '',
        ]
        log.output('输出xls数据:' + ','.join(row) + ',')
        print(k)
        for col, value in enumerate(row):
            sheet.write(k + 1, col, value)
        k += 1

    # NOTE(review): the original indentation was lost; saving once after
    # all files are processed is assumed.
    wbk.save(DATAPATH + XLSname)
    print('=========================================')
|