发表于 2018-5-28 15:42

申请会员ID: 菩提叶

1.申 请 I D:菩提叶
2.个人邮箱:44255977@qq.com
3.技术文章: 70后程序猿一枚,擅长c\c++,delphi,php等,多数代码都代码都已经商业化了,不太好拿出来说事,就拿个十几年前随手写的字符串查找替换小工具来说说吧。
开发工具:delphi6(验证版编译ide是d2009)
需求:实现搜索包含指定文本的文件。(支持多字节字符)
分析:1、由于要支持多字节字符,所以首先要解决的问题是源和目标内容的编码方式必须统一,否则就谈不是比较匹配了,于是写了以下方法来判断并根据编码方式读取文件内容。
type
TTextFormat=(tfAnsi,tfUnicode,tfUnicodeBigEndian,tfUtf8); //定义4种主流编码方式,基本上也就这些了,其它的都是特殊格式,也读不出来,比如word文档


const
TextFormatFlag:array of word=($0000,$FFFE,$FEFF,$EFBB);


function WordLoHiExchange(w:Word):Word;register;
asm
XCHG AL, AH
end;


procedure ReadTextFile(const FileName: string;
var TextFormat: TTextFormat; var sText:string);
var
w:Word;
b:Byte;
begin
with TFileStream.Create(FileName,fmOpenRead or fmShareDenyNone) do
try
   Read(w,2);
   w:=WordLoHiExchange(w);//因为是以Word数据类型读取,故高低字节互换
   if w = TextFormatFlag then
   TextFormat:= tfUnicode
   else if w = TextFormatFlag then
   TextFormat:= tfUnicodeBigEndian
   else if w = TextFormatFlag then
   begin
   Read(b,1);//这里要注意一下,UFT-8必须要跳过三个字节。
   TextFormat:=tfUtf8;
   end else
   begin
   TextFormat:=tfANSI;
   Position:=0;
   end;
   SetLength(sText,Size-Position);
   ReadBuffer(sText,Size-Position);
finally
   Free;
end;
end;


能识别和读取了,那么接下来还得把源和待匹配的内容都转为一种编码才能开始匹配,于是又增加以下函数来实现各编码的转换
function StringToWideStringEx(const S: AnsiString; CodePage: Word): WideString;
var
L: Integer;
begin
L:= MultiByteToWideChar(CodePage, 0, PAnsiChar(S), -1, nil, 0);
SetLength(Result, L-1);
MultiByteToWideChar(CodePage, 0, PAnsiChar(S), -1, PWideChar(Result), L - 1);
end;
//---------------------------------------------------------------------
function WideStringToStringEx(const WS: WideString; CodePage: Word): String;
var
L: Integer;
begin
L := WideCharToMultiByte(CodePage, 0, PWideChar(WS), -1, nil, 0, nil, nil);
SetLength(Result, L-1);
WideCharToMultiByte(CodePage, 0, PWideChar(WS), -1, PAnsiChar(Result), L - 1, nil, nil);
end;


function Converstr(s:WideString):WideString;
var
w:Word;
c:WideChar;
i:Integer;
tmp:WideString;
begin
for i:=1 to Length(s) do
begin
    c:=s;
    w:=word(c);
    w:=WordLoHiExchange(w);
    c:=WideChar(w);
    tmp:=tmp+string(c);
end;
Result:=tmp;
end;


function AnsiToUnicode(Ansi: string):string;
var
s:string;
i, slen:integer;
j:string;
begin
s:='';
sLen := length(Ansi);
for i := 1 to sLen do
begin
    if i mod 2 = 1 then
      j:=IntToHex(word(ansi) shl 8,2)
    else j:=IntToHex(word(ansi),2);
    s:=s+j;
end;
Result :=s;
end;


function HexToInt(hex: char): integer;
begin
if (hex >='1') and (hex <= '9') then result := strToInt(hex)
else if((hex >= 'a') and (hex <= 'f')) then result := integer(hex)- integer('a') + 10
else if (hex >= 'A') and (hex <= 'F') then result := integer(hex)- integer('A') + 10
else result := 0;
end;


function UnicodeToAnsi(Unicode: string):string;
var
s:string;
i, slen, m, n:integer;
begin
s:='';
sLen := length(Unicode) div 2 ;
for i := 1 to sLen do
begin
    m := HexToInt(Unicode);
    n := HexToInt(Unicode);
    s := s+chr(m *16 +n);
end;
Result :=s;
end;


2、内容匹配方面,最主要考虑效率问题,经过测试后决定采用以下算法,原算法出自于一个老外,我只是做了些小修改,使之能够用于匹配中文等多字节字符。
为什么要传源长度和匹配长度呢?也是为了效率,不用每次都计算下长度。为什么使用汇编,也是效率问题,原本delphi本身有提供一个pos函数,也是汇编写的,但算法上不够理想,所以使用了其它方法
function FastPos(
const aSourceString, aFindString : String;
const aSourceLen, aFindLen, StartPos : integer
) : integer;
begin
// Next, we determine how many bytes we need to
// scan to find the "start" of aFindString.
// Remove by Terry Tang
{
SourceLen := aSourceLen;
SourceLen := SourceLen - aFindLen;
if (StartPos-1) > SourceLen then begin
    Result := 0;
    Exit;
end;
SourceLen := SourceLen - StartPos;
SourceLen := SourceLen +2;
}
// Remove end


// The ASM starts here.
asm
    // Delphi uses ESI, EDI, and EBX a lot,
    // so we must preserve them.
    push ESI
    push EDI
    push EBX


// Add by Terry Tang
    Mov ECX, aSourceLen
    Mov EAX, aFindLen
    Sub ECX, EAX
    JL@Result0
    Mov EAX, StartPos
    Dec EAX
    Sub ECX, EAX
    JL@Result0
    Inc ECX
// Add end


    // Get the address of sourceString
    // and Add (StartPos-1).
    // We do this for the purpose of finding
    // the NEXT occurrence, rather than
    // always the first!
    mov EDI, aSourceString
    add EDI, StartPos
    Dec EDI
    // Get the address of aFindString.
    mov ESI, aFindString
    // Note how many bytes we need to
    // look through in aSourceString
    // to find aFindString.


// Remove by Terry Tang
//    mov ECX, SourceLen
// Remove end


    // Get the first char of aFindString;
    // note how it is done outside of the
    // main loop, as it never changes!
    MovAl,
    // Now the FindFirstCharacter loop!
    @ScaSB:
    // Get the value of the current
    // character in aSourceString.
    // This is equal to ah := EDI^, that
    // is what the [] are around .
    MovAh,
    // Compare this character with aDestString.
    cmpAh,Al
    // If they're not equal we don't
    // compare the strings.
    jne@NextChar
    // If they're equal, obviously we do!
    @CompareStrings:
    // Put the length of aFindLen in EBX.
    movEBX, aFindLen
    // We DEC EBX to point to the end of
    // the string; that is, we don't want to
    // add 1 if aFindString is 1 in length!
    decEBX


    // add by ShengQuanhu
    // If EBX is zero, then we've successfully
    // compared each character; i.e. it's A MATCH!
    // It will be happened when aFindLen=1
    Jz @EndOfMatch
    //add end


//Here’s another optimization tip. People at this point usually PUSH ESI and
//so on and then POP ESI and so forth at the end–instead, I opted not to chan
//ge ESI and so on at all. This saves lots of pushing and popping!
    @CompareNext:
    // Get aFindString character +
    // aFindStringLength (the last char).
    movAl,
    // Get aSourceString character (current
    // position + aFindStringLength).
    movAh,
    // Compare them.
    cmpAl, Ah
    Jz   @Matches
    // If they don't match, we put the first char
    // of aFindString into Al again to continue
    // looking for the first character.
    MovAl,
    Jmp@NextChar
    @Matches:
    // If they match, we DEC EBX (point to
    // previous character to compare).
    DecEBX
    // If EBX <> 0 ("J"ump "N"ot "Z"ero), we
    // continue comparing strings.
    Jnz@CompareNext


    //add by Shengquanhu
    @EndOfMatch:
    //add end


    // If EBX is zero, then we've successfully
    // compared each character; i.e. it's A MATCH!
    // Move the address of the *current*
    // character in EDI.
    // Note, we haven't altered EDI since
    // the first char was found.
    movEAX, EDI
    // This is an address, so subtract the
    // address of aSourceString to get
    // an actual character position.
    subEAX, aSourceString
    // Inc EAX to make it 1-based,
    // rather than 0-based.
    incEAX
    // Put it into result.
    movResult, EAX
    // Finish this routine!
    jmp@TheEnd
    @NextChar:
//This is where I jump to when I want to continue searching for the first char
//acter of aFindString in aSearchString:
    // Point EDI (aFindString) to
    // the next character.
    MovAh, //先把第一个字符移到Ah中,后面判断是否中文
    IncEDI
    // Dec ECX tells us that we've checked
    // another character, and that we're
    // fast running out of string to check!
    decECX
    // If EBX <> 0, then continue scanning
    // for the first character.


    //add by shengquanhu
    //if ah is chinese char,jump again
    jz   @Result0


    cmpah, $80
    jb   @ScaSB
    IncEDI
    DecECX
    //add by shengquanhu end


    jnz@ScaSB


    //add by shengquanhu
    @Result0:
    //add by shengquanhu end


    // If EBX = 0, then move 0 into RESULT.
    movResult,0
    // Restore EBX, EDI, ESI for Delphi
    // to work correctly.
    // Note that they're POPped in the
    // opposite order they were PUSHed.
    @TheEnd:
    popEBX
    popEDI
    popESI


end;
end;
另外稍加修改可以适应不区分大小写的情况,代码就不贴出来了。


3、文件搜索,这方面就是调用findfirstfile和findnextfile来检索文件,基本上没什么技术含量,就是要注意一些细节问题,比如文件是否被占用,目标文件是否需要分段读取,由于和界面通讯还得注意下线程安全问题。核心代码如下:


procedure TSearchDirThread.SearchDir(DirName: string);
var
DSearchRec: TSearchRec;
ZSearchRec: TSearchRec;
TmpFile: TStringStream;
f1: TextFile;
f:TFileStream;
soureHX:TMemoryStream;
FindResult, Findsize: integer;
filename, str:string;
astr: AnsiString;
tmp:PWideChar;
CodeType:TTextFormat;
NoCase, founded:Boolean;   //区分大小写


function IsDirNotation(ADirName: String): Boolean;
begin
    Result := (ADirName = '.') or (ADirName = '..');
end;


begin
while onoff do
begin
    exit ;
end;
DirName := GetDirectoryName(DirName);
FindResult := FindFirst(DirName+ftp,faAnyFile,DSearchRec);
try
    while FindResult = 0 do
    begin
      filename:=Trim(DirName+DSearchRec.Name);
      if ((DSearchRec.Attr and faDirectory) <> faDirectory) and (Pos('hiberfil.sys',filename)=0) then
      begin
      EnterCriticalSection(CS);
      Frm_main.stat1.Panels.Text:=filename;
      if FileStatus(filename) then      //先判断文件是否使用中
      begin
          if not Rpl then
          begin
            ReadTextFile(filename,CodeType,str);
            if CodeType=tfAnsi then
            begin
            f:=TFileStream.Create(filename,fmOpenRead or fmShareDenyNone);
            SetLength(astr,f.size+1);
            f.ReadBuffer(astr,f.Size);
            f.Free;
            str:=astr;
      //      TmpFile:=TStringStream.Create('',TEncoding.ASCII);
      //      TmpFile.WriteBuffer(str,Length(str));
      //      astr:=TmpFile.DataString;
            end
            else
            if CodeType=tfUtf8 then
            begin
            f:=TFileStream.Create(filename,fmOpenRead or fmShareDenyNone);
            SetLength(astr,f.size-3);
            f.Position:=3;
            f.ReadBuffer(astr,f.Size-3);
            f.Free;
            str:=UTF8Decode(astr);
            end
            else if (CodeType=tfUnicode) or (CodeType=tfUnicodeBigEndian) then
            str:=GetFileText(filename);


            if CodeType=tfUnicodeBigEndian then
            begin
               FindStr:=ConverStr(Frm_main.FindString.Text);
               RepStr:=ConverStr(Frm_main.rpstring.Text);
            end
            else begin
               FindStr:=Frm_main.FindString.Text;
               RepStr:=Frm_main.rpstring.Text;
            end;
          end
          else begin
            f:=TFileStream.Create(filename,fmOpenRead or fmShareDenyNone);
            SetLength(str,f.size+1);
            f.ReadBuffer(str,f.Size);
            f.Free;
            FindStr:=Frm_main.FindString.Text;
            RepStr:=Frm_main.rpstring.Text;
          end;


          Findsize:=Length(str);
          NoCase:= not Frm_main.chk1.Checked;


          if not Rpl then
          begin
            if NoCase then
            begin
            if FastPosNoCase(str,FindStr,Findsize,Length(FindStr),1)<> 0 then
                founded:=True
            else
                founded:=False;
            end
            else begin
            if FastPos(str,FindStr,Findsize,Length(FindStr),1)<> 0 then
                founded:=True
            else
                founded:=False;
            end;
            if founded then
            begin
            Inc(Fcut);
            Frm_Main.lv1.Items.Add.Caption:=LowerCase(DSearchRec.Name);
            Frm_Main.lv1.Items.Item.SubItems.Add(DirName);
            end;
          end
          else begin
            if NoCase then
            begin
            if FastPosNoCase(str,FindStr,Findsize,Length(FindStr),1)<> 0 then
                founded:=True
            else
                founded:=False;
            end
            else begin
            if FastPos(str,FindStr,Findsize,Length(FindStr),1)<> 0 then
                founded:=True
            else
                founded:=False;
            end;
            if founded then
            begin
            Inc(Fcut);


            Frm_Main.lv1.Items.Add.Caption:=LowerCase(DSearchRec.Name);
            Frm_Main.lv1.Items.Item.SubItems.Add(DirName);
            str:=FastReplace(str,FindStr,RepStr,False);
            soureHX:=TMemoryStream.Create;
             { if CodeType=tfUtf8 then
                str:=UTF8Encode(widestring(str))
            else if (CodeType=tfUnicode) then
                str:=AnsiToUnicode(str)
            else if (CodeType=tfUnicodeBigEndian) then
                str:=AnsiToUnicode(str);   }


            soureHX.Write(str,Length(str));
            soureHX.SaveToFile(filename);
            soureHX.Free;
            end;
          end;
      end;
      LeaveCriticalSection(CS);
      end;
      FindResult := FindNext(DSearchRec);
    end;
finally
    FindClose(DSearchRec);
end;
FindResult := FindFirst(DirName+'*.*', faDirectory+faHidden, zSearchRec);
try
while FindResult = 0 do
begin
    if ((ZSearchRec.Attr and faDirectory) = faDirectory) and not
      IsDirNotation(zSearchRec.Name) then
      Searchdir(DirName+zSearchRec.Name);
    FindResult := FindNext(zSearchRec);
end;
finally
FindClose(zSearchRec);
end;
end;

4,最后就是界面问题,一个好的界面并不一定多华丽,但要让用户用起来没难度,所以就简单易用的原则搞了下,做了一些体验上的优化,如记录最近搜索过的路径,关键字自动完成等。
实际上目前还能baidu搜索到该小工具。这个小工具最大的特点就是快,有兴趣的话可以对比下同类工具,我估计在同类工具里应该是最快的(多年前我测试过,利益于高效的匹配算法,效率方面确实是最高的,十几年过去了,也不知道新的产品有没有改进)。
此次为了验证,特别重新编译了一个,在界面上增加了一个验证按钮,如图。
https://www.52pojie.cn/forum.php?mod=image&aid=1143897&size=300x300&key=a504db5d4afa0be8&nocache=yes&type=贴图错误,请阅读“贴图帮助”。


成品下载地址: https://pan.baidu.com/s/1KE1Me66wpumfJffrVnq1CQ

Hmily 发表于 2018-5-30 17:02

还是有些普通,请补充更多技术分析吧。

danao 发表于 2018-10-20 11:33

看着好厉害
页: [1]
查看完整版本: 申请会员ID: 菩提叶