课题:语法高亮着色转换软件--词法分析器

张炳
2023-12-01

Github地址:https://github.com/Mr-Porridge/Highlight-Spirit

请点个star后大家随便使用

如遇到问题 随时提交issue或联系本人即可 谢谢!

(一)C++高级程序设计语言的子集

| 分类号 | 分类内容 |
| --- | --- |
| 0 | 保留字 |
| 1 | 标识符 |
| 2 | 常量 |
| 3 | 运算符 |
| 4 | 分隔符 |

  • 【0】保留字:
C++保留字49个:
cin,cout,return,extern,public,template,this,

if,else,while,signed,throw,union,this,

int,char,double,float,unsigned,const,goto,

include,for,long,short,virtual,sizeof,static,string,

struct,typedef,break,auto,void,stdio,class,

try,catch,default,false,true,virtual,delete,

friend,inline,namespace,new,operator,private,protected

C++额外保留字3个:
endl,printf,scanf
  • 【1】标识符:即变量
标识符={以字母或'_'开头的,包含字母、下划线、数字的字符串}
  • 【2】常量:
常量={整数、浮点数、布尔型常量、字符型常量}
  • 【3】运算符:
运算符={算术运算符、关系运算符、逻辑运算符、位运算符、赋值运算符、杂项运算符}
  • 【4】分隔符:
分隔符={':', ';', ',', '(', ')', '{', '}', '[', ']'}

 

(二)覆盖以上子集的正规文法

【接着补】 未完待续

 

(三)数据结构

# 词法分析精灵
class spirit:
    def __init__(self, raw_string: str):
        self.raw = raw_string  # 初始化原始文件数据
        # self.de_space()
        # 现在使用直接复制方法初始化 之后使用json进行初始化【之后需要改进】
        self.keywords = []
        self.init_keywords()  # 初始化关键字
        self.furnace = ''  # 词法分析容器
        self.letters = list(string.ascii_letters)  # 26*2个字母
        self.separators = [',', ';', '(', ')', '{', '}', '[', ']']  # 分隔符
        self.underline = '_'  # 下划线 用于处理标识符 _hello
        self.dash = '-'  # 短线 用于 ->
        self.sharp = '#'  # 井号 用于处理 #include #DEFINE
        self.dot = '.'  # . 用于处理 小数 和 对象
        self.left_slash = '/'  # / 用于注释
        self.single_quote = '\''  # 单引号
        self.double_quote = '\"'  # 双引号
        self.slash = '\\'  # \ 用于转义
        self.digits = list(string.digits)  # 0~9数字
        self.tab = '\t'  # 制表符 缩进
        self.enter = '\n'  # 回车换行
        self.single_op = ["%", "!", "^", '&', '|']  # 第一类操作符 包含 && 和 || 拆分成单个渲染即可
        self.double_op = ['+', '-', '=', '>', '<', '*', ':']  # 第二类操作符 有可能两个相同 or <= >= += -=
        # 存储结果
        self.bottle = []

(四)源代码

包含详细注释,如有问题可随时联系。

# 词法分析【不断扩充完善】
def analyze(self):
    self.furnace = ''
    # 蟒生污点 不能修改循环遍历 自建指针
    pointer = -1  # pointer初始化为0时 第一个字符无法读取 所以初始化为-1
    for i in range(len(self.raw)):
        # 判断循环变量与指针的位置
        if i <= pointer:
            continue
        else:
            # 读入空格 继续
            if self.raw[i] == ' ':
                continue
            # 读入标识符组成元素或# 判断:
            # 1、关键字
            # 2、#include #define 等
            # 3、标识符
            # 4、对象类
            elif self.is_letter(self.raw[i]) or self.is_underline(self.raw[i]) or self.is_sharp(self.raw[i]):
                # 读入该字符 移动指针
                self.furnace += self.raw[i]
                i += 1
                pointer = i
                # 使用 while 读完整个单词和下一位 然后回退一位
                # #include 等只能以#打头 中间不允许出现 所以这里不包含self.is_sharp(self.raw[i])
                while self.is_letter(self.raw[i]) or self.is_underline(self.raw[i]) or self.is_digit(self.raw[i]):
                    self.furnace += self.raw[i]
                    # 修改临时循环变量通过 while 读取下一个字符
                    i += 1
                    # 修改自定义指针
                    pointer = i
                # 【重点】多读一位 -> 回退
                pointer -= 1
                # 1、关键字
                if self.furnace in self.keywords:
                    # print("keyword: ", self.furnace)
                    self.bottle.append({"category": "keyword", "value": self.furnace})
                # 2、#include #DEFINE等
                elif self.is_sharp(self.furnace[0]):
                    # print("sharpe-special", self.furnace)
                    self.bottle.append({"category": "sharpe-special", "value": self.furnace})
                # 3、标识符
                else:
                    # print("word is: ", self.furnace)
                    self.bottle.append({"category": "word", "value": self.furnace})
                self.furnace = ''
                continue
            # 读取为分隔符
            elif self.is_separator(self.raw[i]):
                self.furnace = self.raw[i]
                # print("separator is: ", self.furnace)
                self.bottle.append({"category": "separator", "value": self.furnace})
                self.furnace = ''
                continue
            # 读入- 判断->
            elif self.is_dash(self.raw[i]):
                self.furnace = self.raw[i]
                # -> 有且只有两位
                # 所以预读一位 不改变pointer 不需要回退
                if self.raw[i + 1] == '>':
                    self.furnace += self.raw[i + 1]
                    pointer = i + 1
                    # print("arrow-special ", self.furnace)
                    self.bottle.append({"category": "arrow-special", "value": self.furnace})
                    self.furnace = ''
                    continue
                elif self.raw[i + 1] == '-':
                    self.furnace += self.raw[i + 1]
                    pointer = i + 1
                    # print("arrow-special ", self.furnace)
                    self.bottle.append({"category": "double-op", "value": self.furnace})
                    self.furnace = ''
                    continue
                else:
                    # print("single-op ", self.furnace)
                    self.bottle.append({"category": "single-op", "value": self.furnace})
                    self.furnace = ''
                    continue
            # 读入数字
            elif self.is_digit(self.raw[i]):
                dot_legal = True
                # 读入该字符 移动指针
                self.furnace += self.raw[i]
                i += 1
                pointer = i
                # 使用 while 读完整个number 然后回退一位
                while self.is_digit(self.raw[i]) or self.is_dot(self.raw[i]):
                    # 判断小数点 并判断是否合法
                    if self.is_dot(self.raw[i]):
                        if dot_legal:
                            dot_legal = False
                        else:
                            # 两个小数点 非法 退出循环 直接回退一位
                            break
                    self.furnace += self.raw[i]
                    # 修改临时循环变量通过 while 读取下一个字符
                    i += 1
                    # 修改自定义指针
                    pointer = i
                # 【重点】多读一位 -> 回退
                pointer -= 1
                # print("number: ", self.furnace)
                self.bottle.append({"category": "number", "value": self.furnace})
                self.furnace = ''
                continue
            # 读入制表符
            elif self.is_tab(self.raw[i]):
                # self.bottle.append({"category": "tab", "value": self.raw[i]})
                self.bottle.append({"category": "tab", "value": "&emsp;"})
                # print("tab: \\t")
            # 读入换行
            elif self.is_enter(self.raw[i]):
                # self.bottle.append({"category": "enter", "value": self.raw[i]})
                self.bottle.append({"category": "enter", "value": '<br>'})
                # print("enter: \\n")
            # 读入 / 开始判断注释
            elif self.is_left_slash(self.raw[i]):
                self.furnace += self.raw[i]
                i += 1
                pointer = i
                # 多行注释
                if self.raw[i] == '*':
                    self.furnace += self.raw[i]
                    while not ((self.is_left_slash(self.raw[i])) and (self.raw[i - 1] == '*')):
                        i += 1
                        pointer = i
                        self.furnace += self.trans(self.raw[i])
                # 单行注释
                elif self.is_left_slash(self.raw[i]):
                    self.furnace += self.raw[i]
                    while not self.is_enter(self.raw[i]):
                        i += 1
                        pointer = i
                        self.furnace += self.trans(self.raw[i])
                # 其它则为除号
                else:
                    # print("single operator: ", self.furnace)
                    pointer -= 1
                    self.bottle.append({"category": "single-op", "value": self.furnace})
                    self.furnace = ''
                    continue
                # print("comment: ", self.furnace)
                self.bottle.append({"category": "comment", "value": self.furnace})
                self.furnace = ''
                continue
            # 单引号字符串
            elif self.is_single_quote(self.raw[i]):
                self.furnace += self.raw[i]
                i += 1
                pointer = i
                while not self.is_single_quote(self.raw[i]):
                    # 转义字符
                    if self.is_slash(self.raw[i]):
                        self.furnace += "<strong class=\"escape\">" + self.raw[i] + self.raw[i + 1] + "</strong>"
                        i += 2
                        pointer = i
                    else:
                        self.furnace += self.raw[i]
                        i += 1
                        pointer = i
                self.furnace += self.raw[i]
                # print("string1: ", self.furnace)
                self.bottle.append({"category": "string", "value": self.furnace})
                self.furnace = ''
            # 双引号字符串
            elif self.is_double_quote(self.raw[i]):
                self.furnace += self.raw[i]
                i += 1
                pointer = i
                while not self.is_double_quote(self.raw[i]):
                    # 转义字符
                    if self.is_slash(self.raw[i]):
                        self.furnace += "<strong class=\"escape\">" + self.raw[i] + self.raw[i + 1] + "</strong>"
                        i += 2
                        pointer = i
                    else:
                        self.furnace += self.raw[i]
                        i += 1
                        pointer = i
                self.furnace += self.raw[i]
                # print("string2: ", self.furnace)
                self.bottle.append({"category": "string", "value": self.furnace})
                self.furnace = ''
            # 运算符1
            elif self.is_single_op(self.raw[i]):
                self.furnace = self.raw[i]
                # print("single-op: ", self.furnace)
                self.bottle.append({"category": "single-op", "value": self.furnace})
                self.furnace = ''
            # 运算符2
            elif self.is_double_op(self.raw[i]):
                if (self.raw[i + 1] == self.raw[i]) or (self.raw[i + 1] == '='):
                    self.furnace = self.raw[i] + self.raw[i + 1]
                    i += 2
                    pointer = i
                    # print("double operator: ", self.furnace)
                    self.bottle.append({"category": "double-op", "value": self.furnace})
                    self.furnace = ''
                else:
                    self.furnace = self.raw[i]
                    # print("single operator: ", self.furnace)
                    self.bottle.append({"category": "single-op", "value": self.furnace})
                    self.furnace = ''
            # 读入其他 非法
            else:
                # print("illegal identifier: ", self.raw[i])
                self.bottle.append({"category": "illegal", "value": self.raw[i]})

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 类似资料: