Python语言程序设计MOOC-06周文件与字典

2020-06-21

python

在MOOC上，学习了北京理工大学嵩天老师的Python语言程序设计，此为学习笔记记录，备查，备翻阅复习。

常用的编码

ASCII码是标准化字符集，由7个二进制位编码，表示128个字符。ord(), chr()内置函数
Unicode码跨语言跨平台统一且唯一的二进制编码，每个字符两个字节长
UTF-8 是可变长度的Unicode，中文对于Unicode的三字节
GBK编码是专门针对汉字的编码是双字节编码

Python中的字符串编码是未编码的。encode() ， decode()

文件数据

文本文件是基于字符定长的ASCII码文件。方便阅读和理解，
二进制文件，除文本文件以外的其他文件。照片音乐视频计算机程序等。更加节省空间。

文件的基本处理

打开文件：建立计算机存储文件与程序的关联。open(fname, mode)即创建一个文件对象
文件操作：读取写入定位追加计算
关闭文件切断文件与程序的联系

文件打开模式：

r 只读。如果文件不存在，则输出错误
w 只写。如果文件不存在，则自动创建文件
a 附加到文件末尾。
rb 只读二进制文件，如果文件不存在，则输出错误。
wb 只写二进制文件，如果文件不存在，则自动创建文件。
ab 表示附加到二进制文件末尾。
r+ 读写

# 打开一个文本文件
>>>infile = open("numbers.dat","r")
# 打开一个音频文件
>>>infile = open("music.mp3","rb")

文件读取

read()：返回值是包含整个文件内容的一个字符串

readline()：返回值是文件下一行内容的字符串

readlines()：返回值是整个文件内容的列表，每项是以换行符为结尾的一行字符串。

# read()函数
def main():
    fname = input("Enter filename:")
    infile = open(fname,"r")
    data = infile.read()
    print(data)
main()

# readline()函数，示例打印文件的前五行
infile = open(someFile,"r")
for i in range(5):
    line = infile.readline()
    print(line[:-1])#去掉每行最后的分隔符

文件写入

write()：把含有文本数据或二进制数据块的字符串写入文件中
writelines()：针对列表操作，接受一个字符串列表作为参数，将它们写入文件，并且行结束符不会被自动加入。

>>>outfile = open("outfile.txt","w")# 如果不存在会自动创建，如果存在会删除之并创建一个新的空文件
>>>outfile.writelines(["Hello"," ","world"])
>>>outfile.close()
>>>infile = open("outfile.txt","r")
>>>infile.read()
"Hello world"

文件遍历

遍历文件模板

遍历文件修改文件内容

# 通用代码框架
file = open(someFile, "r")
for line in file.readlines():
    #处理一行文件内容
file.close()

# 简化代码框架
file = open(someFile, "r")
for line in file:
    #处理一行文件内容
file.close()

文件内容拷贝的代码示例

def main():
    #用户输入文件名
    f1 = input("Enter a source file:").strip()
    f2 = input("Enter a source file:").strip()
    
    #打开文件
    infile = open(f1,"r") # 源文件
    outfile = open(f2,"w") # 拷贝的目标文件
    
    #拷贝数据
    countlines = countChars = 0
    for line in infile:
        countLines +=1
        countChars +=len(line)
        outfile.write(line)#可以控制拷贝部分内容
    print(countLines,"lines and ", countChars, "chars copied")
    
    infile.close()
    outfile.close()
    
main()

文件实例1

#利用字符串和列表将两个通讯录文本合并为一个文本
def main():
    ftele1=open('TeleAddressBook.txt','rb')
    ftele2=open('EmailAddressBook.txt','rb')
 
    ftele1.readline()#跳过第一行
    ftele2.readline()
    lines1 = ftele1.readlines()
    lines2 = ftele2.readlines()
 
    list1_name = []  
    list1_tele = []
    list2_name = []  
    list2_email = []
 
    for line in lines1:#获取第一个文本中的姓名和电话信息
        elements = line.split()
        list1_name.append(str(elements[0].decode('gbk')))
        list1_tele.append(str(elements[1].decode('gbk')))    #将文本读出来的bytes转换为str类型，可以保证中文字符处理不会出错
 
    for line in lines2:#获取第二个文本中的姓名和邮件信息
        elements = line.split()
        list2_name.append(str(elements[0].decode('gbk')))
        list2_email.append(str(elements[1].decode('gbk')))
 
    ###开始处理###
    lines = []
    lines.append('姓名\t    电话   \t  邮箱\n')
 
    #按索引方式遍历姓名列表1
    for i in range(len(list1_name)): 
        s= ''
        if list1_name[i] in list2_name:
                j = list2_name.index(list1_name[i]) #找到姓名列表1对应列表2中的姓名索引位置
                s = '\t'.join([list1_name[i], list1_tele[i], list2_email[j]])
                s += '\n'
        else:
                s = '\t'.join([list1_name[i], list1_tele[i], str('   -----   ')])
                s += '\n'
        lines.append(s)
         
    #处理姓名列表2中剩余的姓名        
    for i in range(len(list2_name)): 
        s= ''
        if list2_name[i] not in list1_name:
                s = '\t'.join([list2_name[i], str('   -----   '), list2_email[i]])
                s += '\n'
        lines.append(s)  
 
    ftele3 = open('AddressBook.txt', 'w')
    ftele3.writelines(lines)
    ftele3.close()
    ftele1.close()
    ftele2.close()
 
    print("The addressBooks are merged!")
 
if __name__ == "__main__":
    main()

字典的基础

针对非序列集合而提供的一种数据类型。因为有时候查询数据并不知道索引信息，所以采用键值对。

字典是一个键值对的集合，实现了映射，键为索引。

python内部对字典元素的存储做了优化，其顺序并不是当初创建字典元素的顺序。

字典的操作

{} 创建字典

然后通过中括号[key] = value为字典增加新元素

删除字典的一项 del dicName[key]

字典的遍历：遍历key, value, item， key:value

in 或者 not in

字典的标准操作符: -,<,>,<=, >=,==,!=, and, or, not

字典方法

keys()

values()

items()

clear()

get(key)

pop(key)

update(key)

字典实例1

统计词频的问题，这里是统一英文文章。该例子是比较实用的。

英文文章以空格或者标点符号分割单词。

import turtle
 
##全局变量##
#词频排列显示个数
count = 10
#单词频率数组-作为y轴数据
data = []
#单词数组-作为x轴数据
words = []
#y轴显示放大倍数-可以根据词频数量进行调节
yScale = 6
#x轴显示放大倍数-可以根据count数量进行调节
xScale = 30
 
################# Turtle Start  ####################  
#从点(x1,y1)到(x2,y2)绘制线段
def drawLine(t, x1, y1, x2, y2):
    t.penup()
    t.goto (x1, y1)
    t.pendown()
    t.goto (x2, y2)
 
# 在坐标(x,y)处写文字
def drawText(t, x, y, text):
    t.penup()
    t.goto (x, y)
    t.pendown()
    t.write(text)
 
def drawGraph(t):
    #绘制x/y轴线
    drawLine (t, 0, 0, 360, 0)
    drawLine (t, 0, 300, 0, 0)
 
    #x轴: 坐标及描述
    for x in range(count):
        x=x+1 #向右移一位,为了不画在原点上
        drawText(t, x*xScale-4, -20, (words[x-1]))
        drawText(t, x*xScale-4, data[x-1]*yScale+10, data[x-1])
    drawBar(t)
 
#绘制一个柱体
def drawRectangle(t, x, y):
    x = x*xScale
    y = y*yScale#放大倍数显示
    drawLine(t, x-5, 0, x-5, y)
    drawLine(t, x-5, y, x+5, y)
    drawLine(t, x+5, y, x+5, 0)
    drawLine(t, x+5, 0, x-5, 0)
     
#绘制多个柱体
def drawBar(t):
    for i in range(count):
        drawRectangle(t, i+1, data[i])    
################# Turtle End  ####################
 
         
#对文本的每一行计算词频的函数
def processLine(line, wordCounts):
    #用空格替换标点符号
    line = replacePunctuations(line)
    #从每一行获取每个词
    words = line.split() 
    for word in words:
        if word in wordCounts:
            wordCounts[word] += 1
        else:
            wordCounts[word] = 1
 
#空格替换标点的函数
def replacePunctuations(line):
    for ch in line:
        if ch in "~@#$%^&*()_-+=<>?/,.:;{}[]|\'""":
            line = line.replace(ch, " ")
    return line
 
def main():
    #用户输入一个文件名
    filename = input("enter a filename:").strip()
    infile = open(filename, "r")
     
    #建立用于计算词频的空字典
    wordCounts = {}
    for line in infile:
        processLine(line.lower(), wordCounts)
         
    #从字典中获取数据对
    pairs = list(wordCounts.items())
 
    #列表中的数据对交换位置,数据对排序
    items = [[x,y]for (y,x)in pairs] 
    items.sort() 
 
    #输出count个数词频结果
    for i in range(len(items)-1, len(items)-count-1, -1):
        print(items[i][1]+"\t"+str(items[i][0]))
        data.append(items[i][0])
        words.append(items[i][1])
         
    infile.close()
     
    #根据词频结果绘制柱状图
    turtle.title('词频结果柱状图')
    turtle.setup(900, 750, 0, 0)
    t = turtle.Turtle()
    t.hideturtle()
    t.width(3)
    drawGraph(t)
         
#调用main()函数
if __name__ == '__main__':
    main()

字典实例2

通过字典的数据操作方式，可以更简单地解决上述的一个合并姓名电话邮箱地址簿的任务

#利用字典将两个通讯录文本合并为一个文本
def main():
        ftele2=open('TeleAddressBook.txt','rb')
        ftele1=open('EmailAddressBook.txt','rb')
 
        ftele1.readline()#跳过第一行
        ftele2.readline()
        lines1 = ftele1.readlines()
        lines2 = ftele2.readlines()
 
        dic1 = {}   #字典方式保存
        dic2 = {}
 
 
        for line in lines1:#获取第一个本文中的姓名和电话信息
                elements = line.split()
                #将文本读出来的bytes转换为str类型
                dic1[elements[0]] = str(elements[1].decode('gbk'))
                 
        for line in lines2:#获取第二个本文中的姓名和电话信息
                elements = line.split()
                dic2[elements[0]] = str(elements[1].decode('gbk'))
 
        ###开始处理###
        lines = []
        lines.append('姓名\t    电话   \t  邮箱\n')
 
        for key in dic1:
            s= ''
            if key in dic2.keys():
                    s = '\t'.join([str(key.decode('gbk')), dic1[key], dic2[key]])
                    s += '\n'
            else:
                    s = '\t'.join([str(key.decode('gbk')), dic1[key], str('   -----   ')])
                    s += '\n'
            lines.append(s)
             
        for key in dic2:
            s= ''
            if key not in dic1.keys():
                    s = '\t'.join([str(key.decode('gbk')), str('   -----   '), dic2[key]])
                    s += '\n'       
            lines.append(s)
 
        ftele3 = open('AddressBook.txt', 'w')
        ftele3.writelines(lines)
 
        ftele3.close()
        ftele1.close()
        ftele2.close()
        print("The addressBooks are merged!")
 
if __name__ == "__main__":
        main()

Python语言程序设计MOOC-06周 文件与字典