From what you've described, you'll have to read the first line of every file in order to group the files by identifier. I think something like this would do what you're looking for:
import os
import collections
import random
import shutil
def get_identifier(path):
    """Return the identifier stored on the first line of the file at ``path``.

    Only the first line is read. Leading and trailing whitespace, including
    the trailing newline, is stripped from it. An empty file yields an empty
    string.

    Raises:
        OSError: if the file cannot be opened.
    """
    with open(path) as fd:
        # strip() drops the trailing "\n" so it does not become part of the
        # identifier (it also removes any other surrounding whitespace).
        return fd.readline().strip()
# Files to classify; the first line of each file is its identifier.
paths = ['/home/file1', '/home/file2', '/home/file3']
# Directory that receives the sampled copies.
destination_dir = '/tmp'
# Maximum number of files copied per identifier.
sample_size = 500

# Group the paths by identifier: identifier -> list of paths carrying it.
identifiers = collections.defaultdict(list)
for path in paths:
    identifier = get_identifier(path)
    identifiers[identifier].append(path)

# Copy a random sample of each group into destination_dir.
for group_paths in identifiers.values():
    # random.sample raises ValueError when asked for more items than the
    # population holds, so cap the request at the size of the group.
    sample = random.sample(group_paths, min(sample_size, len(group_paths)))
    for path in sample:
        # NOTE: only the base name is kept, so sampled files that share a
        # base name end up at the same destination path.
        file_name = os.path.basename(path)
        destination = os.path.join(destination_dir, file_name)
        shutil.copy(path, destination)